Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement callsite-hotness based inline cost for Sample-based PGO
Summary: For sample-based PGO, using BFI to calculate the callsite count is sometimes inaccurate. This is because, with a sampling-based approach, if a callsite resides in a hot loop deeply nested in a bunch of cold branches, the callsite's BFI frequency will be inaccurately calculated due to the lack of samples in the cold branches. E.g. if (A1 && A2 && A3 && ..... && A10) { for (i=0; i < 100000000; i++) { callsite(); } } Assume that A1 to A10 are all 100% taken, and callsite has 1000 samples and thus is considered hot. Because the loop's trip count is huge, it's normal that all branches outside the loop have no samples at all. As a result, we can only use static branch probability to derive the frequency of the loop header. Assuming that the static heuristic thinks each branch is 50% taken, the count calculated from BFI will be 1/(2^10) of the actual value. In order to get a more accurate callsite count, we directly annotate the weight on the call instruction, and directly use it when checking callsite hotness. Note that this mechanism can also be shared by instrumentation-based callsite hotness analysis. The side benefit is that it breaks the dependency from Inliner to BFI, as the call count is embedded in the IR. Reviewers: davidxl, eraman, dnovillo Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D22118 llvm-svn: 275073
- Loading branch information
Showing
5 changed files
with
103 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
; RUN: opt < %s -inline -inline-threshold=0 -inlinehint-threshold=100 -S | FileCheck %s | ||
|
||
; This tests that a hot callsite gets the (higher) inlinehint-threshold even | ||
; without inline hints and gets inlined because the cost is less than | ||
; inlinehint-threshold. A cold callsite to a callee with an identical body is not inlined because | ||
; its cost exceeds the inline-threshold. | ||
|
||
; @callee1 returns %x + 3, computed with three chained adds of 1. | ||
; Its call in @caller2 carries !prof !21. | ||
define i32 @callee1(i32 %x) { | ||
%x1 = add i32 %x, 1 | ||
%x2 = add i32 %x1, 1 | ||
%x3 = add i32 %x2, 1 | ||
|
||
ret i32 %x3 | ||
} | ||
|
||
; @callee2 has the same body as @callee1 (returns %x + 3). | ||
; Its call in @caller2 carries !prof !22. | ||
define i32 @callee2(i32 %x) { | ||
; CHECK-LABEL: @callee2( | ||
%x1 = add i32 %x, 1 | ||
%x2 = add i32 %x1, 1 | ||
%x3 = add i32 %x2, 1 | ||
|
||
ret i32 %x3 | ||
} | ||
|
||
; @caller2 calls @callee2 (!prof !22, weight 1) and feeds its result to @callee1 (!prof !21, weight 300). | ||
; The CHECK lines expect the @callee2 call to remain, no @callee1 call to follow it, and the | ||
; return value to be %x3.i (presumably the inlined copy of @callee1's %x3 - confirm the inliner's naming). | ||
define i32 @caller2(i32 %y1) { | ||
; CHECK-LABEL: @caller2( | ||
; CHECK: call i32 @callee2 | ||
; CHECK-NOT: call i32 @callee1 | ||
; CHECK: ret i32 %x3.i | ||
%y2 = call i32 @callee2(i32 %y1), !prof !22 | ||
%y3 = call i32 @callee1(i32 %y2), !prof !21 | ||
ret i32 %y3 | ||
} | ||
|
||
!llvm.module.flags = !{!1} | ||
; Call-site weights attached via !prof in @caller2: !21 on the @callee1 call, !22 on the @callee2 call. | ||
!21 = !{!"branch_weights", i64 300} | ||
!22 = !{!"branch_weights", i64 1} | ||
|
||
; Module-level ProfileSummary, referenced from !llvm.module.flags through !1. | ||
!1 = !{i32 1, !"ProfileSummary", !2} | ||
!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} | ||
!3 = !{!"ProfileFormat", !"InstrProf"} | ||
!4 = !{!"TotalCount", i64 10000} | ||
!5 = !{!"MaxCount", i64 1000} | ||
!6 = !{!"MaxInternalCount", i64 1} | ||
!7 = !{!"MaxFunctionCount", i64 1000} | ||
!8 = !{!"NumCounts", i64 3} | ||
!9 = !{!"NumFunctions", i64 3} | ||
!10 = !{!"DetailedSummary", !11} | ||
; Detailed summary entries; presumably {cutoff, min count, number of counts} - confirm against the LLVM ProfileSummary docs. | ||
!11 = !{!12, !13, !14} | ||
!12 = !{i32 10000, i64 100, i32 1} | ||
!13 = !{i32 999000, i64 100, i32 1} | ||
!14 = !{i32 999999, i64 1, i32 2} |