-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Inline][PGO] After inline, update InvokeInst profile counts in caller and cloned callee #83809
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-llvm-ir Author: Mingming Liu (minglotus-6) ChangesA related change is https://reviews.llvm.org/D133121, which correctly preserves both branch weights and value profiles for invoke instruction.
Full diff: https://github.com/llvm/llvm-project/pull/83809.diff 4 Files Affected:
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index bc357074e5cb21..1146b3fa3ae244 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -4360,6 +4360,9 @@ class InvokeInst : public CallBase {
unsigned getNumSuccessors() const { return 2; }
+ /// Updates profile metadata by scaling it by \p S / \p T.
+ void updateProfWeight(uint64_t S, uint64_t T);
+
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const Instruction *I) {
return (I->getOpcode() == Instruction::Invoke);
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 9ae71acd523c36..920ce67f118991 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -918,6 +918,18 @@ LandingPadInst *InvokeInst::getLandingPadInst() const {
return cast<LandingPadInst>(getUnwindDest()->getFirstNonPHI());
}
+void InvokeInst::updateProfWeight(uint64_t S, uint64_t T) {
+ if (T == 0) {
+ LLVM_DEBUG(dbgs() << "Attempting to update profile weights will result in "
+ "div by 0. Ignoring. Likely the function "
+ << getParent()->getParent()->getName()
+ << " has 0 entry count, and contains call instructions "
+ "with non-zero prof info.");
+ return;
+ }
+ scaleProfData(*this, S, T);
+}
+
//===----------------------------------------------------------------------===//
// CallBrInst Implementation
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index f68fdb26f28173..75b0d0669e9228 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1909,10 +1909,14 @@ void llvm::updateProfileCallee(
// During inlining ?
if (VMap) {
uint64_t CloneEntryCount = PriorEntryCount - NewEntryCount;
- for (auto Entry : *VMap)
+ for (auto Entry : *VMap) {
if (isa<CallInst>(Entry.first))
if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second))
CI->updateProfWeight(CloneEntryCount, PriorEntryCount);
+ if (isa<InvokeInst>(Entry.first))
+ if (auto *II = dyn_cast_or_null<InvokeInst>(Entry.second))
+ II->updateProfWeight(CloneEntryCount, PriorEntryCount);
+ }
}
if (EntryDelta) {
@@ -1921,9 +1925,12 @@ void llvm::updateProfileCallee(
for (BasicBlock &BB : *Callee)
// No need to update the callsite if it is pruned during inlining.
if (!VMap || VMap->count(&BB))
- for (Instruction &I : BB)
+ for (Instruction &I : BB) {
if (CallInst *CI = dyn_cast<CallInst>(&I))
CI->updateProfWeight(NewEntryCount, PriorEntryCount);
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&I))
+ II->updateProfWeight(NewEntryCount, PriorEntryCount);
+ }
}
}
diff --git a/llvm/test/Transforms/Inline/update_invoke_value_profile.ll b/llvm/test/Transforms/Inline/update_invoke_value_profile.ll
new file mode 100644
index 00000000000000..ac5597a41fce61
--- /dev/null
+++ b/llvm/test/Transforms/Inline/update_invoke_value_profile.ll
@@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -inline-threshold=1000 -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%class.Error = type { i32 }
+@_ZTI5Error = external constant { ptr, ptr }
+
+define i32 @callee(ptr %b) personality ptr @__gxx_personality_v0 !prof !17 {
+; CHECK-LABEL: define i32 @callee(
+; CHECK-SAME: ptr [[B:%.*]]) personality ptr @__gxx_personality_v0 !prof [[PROF0:![0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[E:%.*]] = alloca [[CLASS_ERROR:%.*]], align 8
+; CHECK-NEXT: [[VTABLE:%.*]] = load ptr, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VTABLE]], align 8
+; CHECK-NEXT: [[CALL:%.*]] = invoke i32 [[TMP0]](ptr [[B]])
+; CHECK-NEXT: to label [[TRY_CONT:%.*]] unwind label [[LPAD:%.*]], !prof [[PROF1:![0-9]+]]
+; CHECK: lpad:
+; CHECK-NEXT: [[TMP1:%.*]] = landingpad { ptr, i32 }
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: catch ptr @_ZTI5Error
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { ptr, i32 } [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr @_ZTI5Error)
+; CHECK-NEXT: [[MATCHES:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: br i1 [[MATCHES]], label [[CATCH:%.*]], label [[EHCLEANUP:%.*]]
+; CHECK: catch:
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP1]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT: store i32 [[TMP6]], ptr [[E]], align 4
+; CHECK-NEXT: [[CALL3:%.*]] = invoke i32 @_ZN5Error10error_codeEv(ptr [[E]])
+; CHECK-NEXT: to label [[INVOKE_CONT2:%.*]] unwind label [[LPAD1:%.*]]
+; CHECK: invoke.cont2:
+; CHECK-NEXT: br label [[TRY_CONT]]
+; CHECK: try.cont:
+; CHECK-NEXT: [[RET_0:%.*]] = phi i32 [ [[CALL3]], [[INVOKE_CONT2]] ], [ [[CALL]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[RET_0]]
+; CHECK: lpad1:
+; CHECK-NEXT: [[TMP7:%.*]] = landingpad { ptr, i32 }
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: invoke void @__cxa_end_catch()
+; CHECK-NEXT: to label [[INVOKE_CONT4:%.*]] unwind label [[TERMINATE_LPAD:%.*]]
+; CHECK: invoke.cont4:
+; CHECK-NEXT: br label [[EHCLEANUP]]
+; CHECK: ehcleanup:
+; CHECK-NEXT: [[LPAD_VAL7_MERGED:%.*]] = phi { ptr, i32 } [ [[TMP7]], [[INVOKE_CONT4]] ], [ [[TMP1]], [[LPAD]] ]
+; CHECK-NEXT: resume { ptr, i32 } [[LPAD_VAL7_MERGED]]
+; CHECK: terminate.lpad:
+; CHECK-NEXT: [[TMP8:%.*]] = landingpad { ptr, i32 }
+; CHECK-NEXT: catch ptr null
+; CHECK-NEXT: unreachable
+;
+entry:
+ %e = alloca %class.Error
+ %vtable = load ptr, ptr %b
+ %0 = load ptr, ptr %vtable
+ %call = invoke i32 %0(ptr %b)
+ to label %try.cont unwind label %lpad, !prof !15
+
+lpad:
+ %1 = landingpad { ptr, i32 }
+ cleanup
+ catch ptr @_ZTI5Error
+ %2 = extractvalue { ptr, i32 } %1, 1
+ %3 = tail call i32 @llvm.eh.typeid.for(ptr @_ZTI5Error)
+ %matches = icmp eq i32 %2, %3
+ br i1 %matches, label %catch, label %ehcleanup
+
+catch:
+ %4 = extractvalue { ptr, i32 } %1, 0
+ %5 = tail call ptr @__cxa_begin_catch(ptr %4)
+ %6 = load i32, ptr %5
+ store i32 %6, ptr %e
+ %call3 = invoke i32 @_ZN5Error10error_codeEv(ptr %e)
+ to label %invoke.cont2 unwind label %lpad1
+
+invoke.cont2:
+ br label %try.cont
+
+try.cont:
+ %ret.0 = phi i32 [ %call3, %invoke.cont2 ], [ %call, %entry ]
+ ret i32 %ret.0
+
+lpad1:
+ %7 = landingpad { ptr, i32 }
+ cleanup
+ invoke void @__cxa_end_catch()
+ to label %invoke.cont4 unwind label %terminate.lpad
+
+invoke.cont4:
+ br label %ehcleanup
+
+ehcleanup:
+ %lpad.val7.merged = phi { ptr, i32 } [ %7, %invoke.cont4 ], [ %1, %lpad ]
+ resume { ptr, i32 } %lpad.val7.merged
+
+terminate.lpad:
+ %8 = landingpad { ptr, i32 }
+ catch ptr null
+ unreachable
+}
+
+define i32 @caller(ptr %b) !prof !16 {
+; CHECK-LABEL: define i32 @caller(
+; CHECK-SAME: ptr [[B:%.*]]) personality ptr @__gxx_personality_v0 !prof [[PROF2:![0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[E_I:%.*]] = alloca [[CLASS_ERROR:%.*]], align 8
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[E_I]])
+; CHECK-NEXT: [[VTABLE_I:%.*]] = load ptr, ptr [[B]], align 8
+; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VTABLE_I]], align 8
+; CHECK-NEXT: [[CALL_I:%.*]] = invoke i32 [[TMP0]](ptr [[B]])
+; CHECK-NEXT: to label [[CALLEE_EXIT:%.*]] unwind label [[LPAD_I:%.*]], !prof [[PROF3:![0-9]+]]
+; CHECK: lpad.i:
+; CHECK-NEXT: [[TMP1:%.*]] = landingpad { ptr, i32 }
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: catch ptr @_ZTI5Error
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { ptr, i32 } [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.eh.typeid.for(ptr @_ZTI5Error)
+; CHECK-NEXT: [[MATCHES_I:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT: br i1 [[MATCHES_I]], label [[CATCH_I:%.*]], label [[EHCLEANUP_I:%.*]]
+; CHECK: catch.i:
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { ptr, i32 } [[TMP1]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = tail call ptr @__cxa_begin_catch(ptr [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4
+; CHECK-NEXT: store i32 [[TMP6]], ptr [[E_I]], align 4
+; CHECK-NEXT: [[CALL3_I:%.*]] = invoke i32 @_ZN5Error10error_codeEv(ptr [[E_I]])
+; CHECK-NEXT: to label [[INVOKE_CONT2_I:%.*]] unwind label [[LPAD1_I:%.*]]
+; CHECK: invoke.cont2.i:
+; CHECK-NEXT: br label [[CALLEE_EXIT]]
+; CHECK: lpad1.i:
+; CHECK-NEXT: [[TMP7:%.*]] = landingpad { ptr, i32 }
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: invoke void @__cxa_end_catch()
+; CHECK-NEXT: to label [[INVOKE_CONT4_I:%.*]] unwind label [[TERMINATE_LPAD_I:%.*]]
+; CHECK: invoke.cont4.i:
+; CHECK-NEXT: br label [[EHCLEANUP_I]]
+; CHECK: ehcleanup.i:
+; CHECK-NEXT: [[LPAD_VAL7_MERGED_I:%.*]] = phi { ptr, i32 } [ [[TMP7]], [[INVOKE_CONT4_I]] ], [ [[TMP1]], [[LPAD_I]] ]
+; CHECK-NEXT: resume { ptr, i32 } [[LPAD_VAL7_MERGED_I]]
+; CHECK: terminate.lpad.i:
+; CHECK-NEXT: [[TMP8:%.*]] = landingpad { ptr, i32 }
+; CHECK-NEXT: catch ptr null
+; CHECK-NEXT: unreachable
+; CHECK: callee.exit:
+; CHECK-NEXT: [[RET_0_I:%.*]] = phi i32 [ [[CALL3_I]], [[INVOKE_CONT2_I]] ], [ [[CALL_I]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[E_I]])
+; CHECK-NEXT: ret i32 [[RET_0_I]]
+;
+entry:
+ %call = tail call i32 @callee(ptr %b)
+ ret i32 %call
+}
+
+declare i32 @__gxx_personality_v0(...)
+declare i32 @llvm.eh.typeid.for(ptr)
+declare ptr @__cxa_begin_catch(ptr)
+declare i32 @_ZN5Error10error_codeEv(ptr)
+declare void @__cxa_end_catch()
+
+
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 10}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 1000}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 100, i32 1}
+!13 = !{i32 999000, i64 100, i32 1}
+!14 = !{i32 999999, i64 1, i32 2}
+!15 = !{!"VP", i32 0, i64 1500, i64 9261744921105590125, i64 1500}
+!16 = !{!"function_entry_count", i64 1000}
+!17 = !{!"function_entry_count", i64 1500}
+;.
+; CHECK: [[PROF0]] = !{!"function_entry_count", i64 500}
+; CHECK: [[PROF1]] = !{!"VP", i32 0, i64 500, i64 -9184999152603961491, i64 500}
+; CHECK: [[PROF2]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF3]] = !{!"VP", i32 0, i64 1000, i64 -9184999152603961491, i64 1000}
+;.
|
%class.Error = type { i32 } | ||
@_ZTI5Error = external constant { ptr, ptr } | ||
|
||
define i32 @callee(ptr %b) personality ptr @__gxx_personality_v0 !prof !17 { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The IR is reduced from this C++ example https://gcc.godbolt.org/z/brq413zf9
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The invoke instruction can have 3 different kinds of prof data
-
call count (if a direct call)
-
VP profile data (if an indirect call)
-
branch weights for landing pad.
-
can coexist with 2) and does not need to be updated. Is there an existing test coverage for type 1) update?
thanks for the list. I get the point that 1 and 2 are mutually exclusive. For my understanding, which one out of {1, 2, 3} doesn't need to be updated?
Searching under Meanwhile, non-call instructions could have branch weights (notably |
Added test cases for 1 and 2 in the pre-commit PR so the diff in this one is clearer. As discussed offline, non-count prof data (i.e., representing taken or not taken branches, associated with |
Actually I just realized I'll update the method and add a regression test to catch this. |
…ions, and add regression test
Bump. |
llvm/lib/IR/ProfDataUtils.cpp
Outdated
// If an instruction is a call and its branch weight has more than two | ||
// operands, it represents taken vs not-taken branch probabilities and doesn't | ||
// need scaling. | ||
if (isa<CallBase>(&I) && ProfDataName->getString().equals("branch_weights") && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we introduce a common API to get the profile annotation type? BR_Weight, Call_Count ...?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Probably as a pre-cleanup NFC patch.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As discussed added hasCountTypeMD
in ProfDataUtils.cpp. PTAL, thanks!
return true; | ||
// Conservatively assume non CallBase instruction only get taken/not-taken | ||
// branch probability, so not interpret them as count. | ||
return isa<CallBase>(I) && !isBranchWeightMD(ProfileData); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems that the check of CallBase is not necessary. Select instruction is another one taking prof MD, and it can only be branch weight type.
If checking callbase is not needed, passing Profile meta data pointer is better.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Non-CallBase instructions (e.g., SwitchInst) could have branch weights with two ops (i.e., one branch_weight
string and one int), and the helper function hasCountTypeMD
would be more self-contained to return false
(tell caller it's not count-type profile) since we know only CallInst or InvokeInst carry count-type values in branch_weights
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, I forgot about SwitchInst with only default case. (Note that the documentation is out of date. SelectInst also supports branch_weights). LGTM.
A related change is https://reviews.llvm.org/D133121, which correctly preserves both branch weights and value profiles for invoke instruction.
invokeinst
specifies taken / not-taken branches, there is no scale.