From fb45f3c9486f5d9e3003db95386432562b23577c Mon Sep 17 00:00:00 2001 From: Ruobing Han Date: Sun, 4 Sep 2022 12:56:25 -0400 Subject: [PATCH] [SimpleLoopUnswitch] Skip non-trivial unswitching of cold functions On the current main branch, non-trivial unswitching is never applied to cold loops. As reported in D129599, skipping these cold loops causes a regression in the SPEC benchmarks. Thus, instead of skipping all cold loops, now skip only loops in cold functions. Reviewed By: alexgatea, aeubanks Differential Revision: https://reviews.llvm.org/D133275 --- .../Transforms/Scalar/SimpleLoopUnswitch.cpp | 2 +- .../PGO-nontrivial-unswitch.ll | 80 ++----------------- .../PGO-nontrivial-unswitch2.ll | 71 +++++++++------- 3 files changed, 51 insertions(+), 102 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 70f97d807cb54..e1d5bb5d6c8c0 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -3086,7 +3086,7 @@ unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI, AssumptionCache &AC, // Skip cold loops, as unswitching them brings little benefit // but increases the code size if (PSI && PSI->hasProfileSummary() && BFI && - PSI->isColdBlock(L.getHeader(), BFI)) { + PSI->isFunctionColdInCallGraph(L.getHeader()->getParent(), *BFI)) { LLVM_DEBUG(dbgs() << " Skip cold loop: " << L << "\n"); return false; } diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll index c442f04798978..eeb5014cb47f9 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch.ll @@ -6,89 +6,27 @@ declare i32 @a() declare i32 @b() - +; Check loops in cold functions will not be applied non-trivial loop unswitch define void @f1(i32 %i, i1 %cond, i1 %hot_cond, i1 %cold_cond,
i1* %ptr) !prof !0 { ; CHECK-LABEL: @f1( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[ENTRY_HOT_LOOP:%.*]] -; CHECK: entry_hot_loop: -; CHECK-NEXT: br i1 [[HOT_COND:%.*]], label [[HOT_LOOP_BEGIN_PREHEADER:%.*]], label [[HOT_LOOP_EXIT:%.*]], !prof [[PROF15:![0-9]+]] -; CHECK: hot_loop_begin.preheader: -; CHECK-NEXT: br i1 [[COND:%.*]], label [[HOT_LOOP_BEGIN_PREHEADER_SPLIT_US:%.*]], label [[HOT_LOOP_BEGIN_PREHEADER_SPLIT:%.*]] -; CHECK: hot_loop_begin.preheader.split.us: -; CHECK-NEXT: br label [[HOT_LOOP_BEGIN_US:%.*]] -; CHECK: hot_loop_begin.us: -; CHECK-NEXT: br label [[HOT_LOOP_A_US:%.*]] -; CHECK: hot_loop_a.us: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @a() -; CHECK-NEXT: br label [[HOT_LOOP_LATCH_US:%.*]] -; CHECK: hot_loop_latch.us: -; CHECK-NEXT: [[V1_US:%.*]] = load i1, i1* [[PTR:%.*]], align 1 -; CHECK-NEXT: br i1 [[V1_US]], label [[HOT_LOOP_BEGIN_US]], label [[HOT_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]] -; CHECK: hot_loop_exit.loopexit.split.us: -; CHECK-NEXT: br label [[HOT_LOOP_EXIT_LOOPEXIT:%.*]] -; CHECK: hot_loop_begin.preheader.split: -; CHECK-NEXT: br label [[HOT_LOOP_BEGIN:%.*]] -; CHECK: hot_loop_begin: -; CHECK-NEXT: br label [[HOT_LOOP_B:%.*]] -; CHECK: hot_loop_b: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @b() -; CHECK-NEXT: br label [[HOT_LOOP_LATCH:%.*]] -; CHECK: hot_loop_latch: -; CHECK-NEXT: [[V1:%.*]] = load i1, i1* [[PTR]], align 1 -; CHECK-NEXT: br i1 [[V1]], label [[HOT_LOOP_BEGIN]], label [[HOT_LOOP_EXIT_LOOPEXIT_SPLIT:%.*]] -; CHECK: hot_loop_exit.loopexit.split: -; CHECK-NEXT: br label [[HOT_LOOP_EXIT_LOOPEXIT]] -; CHECK: hot_loop_exit.loopexit: -; CHECK-NEXT: br label [[HOT_LOOP_EXIT]] -; CHECK: hot_loop_exit: -; CHECK-NEXT: br label [[ENTRY_COLD_LOOP:%.*]] -; CHECK: entry_cold_loop: -; CHECK-NEXT: br i1 [[COLD_COND:%.*]], label [[COLD_LOOP_BEGIN_PREHEADER:%.*]], label [[COLD_LOOP_EXIT:%.*]], !prof [[PROF16:![0-9]+]] -; CHECK: cold_loop_begin.preheader: ; CHECK-NEXT: br label [[COLD_LOOP_BEGIN:%.*]] ; CHECK: cold_loop_begin: -; 
CHECK-NEXT: br i1 [[COND]], label [[COLD_LOOP_A:%.*]], label [[COLD_LOOP_B:%.*]] +; CHECK-NEXT: br i1 [[COND:%.*]], label [[COLD_LOOP_A:%.*]], label [[COLD_LOOP_B:%.*]] ; CHECK: cold_loop_a: -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @a() +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @a() ; CHECK-NEXT: br label [[COLD_LOOP_LATCH:%.*]] ; CHECK: cold_loop_b: -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @b() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @b() ; CHECK-NEXT: br label [[COLD_LOOP_LATCH]] ; CHECK: cold_loop_latch: -; CHECK-NEXT: [[V2:%.*]] = load i1, i1* [[PTR]], align 1 -; CHECK-NEXT: br i1 [[V2]], label [[COLD_LOOP_BEGIN]], label [[COLD_LOOP_EXIT_LOOPEXIT:%.*]] -; CHECK: cold_loop_exit.loopexit: -; CHECK-NEXT: br label [[COLD_LOOP_EXIT]] +; CHECK-NEXT: [[V2:%.*]] = load i1, i1* [[PTR:%.*]], align 1 +; CHECK-NEXT: br i1 [[V2]], label [[COLD_LOOP_BEGIN]], label [[COLD_LOOP_EXIT:%.*]] ; CHECK: cold_loop_exit: ; CHECK-NEXT: ret void ; entry: - br label %entry_hot_loop - -entry_hot_loop: - br i1 %hot_cond, label %hot_loop_begin, label %hot_loop_exit, !prof !15 - -hot_loop_begin: - br i1 %cond, label %hot_loop_a, label %hot_loop_b - -hot_loop_a: - call i32 @a() - br label %hot_loop_latch - -hot_loop_b: - call i32 @b() - br label %hot_loop_latch - -hot_loop_latch: - %v1 = load i1, i1* %ptr - br i1 %v1, label %hot_loop_begin, label %hot_loop_exit - -hot_loop_exit: - br label %entry_cold_loop - -entry_cold_loop: - br i1 %cold_cond, label %cold_loop_begin, label %cold_loop_exit, !prof !16 + br label %cold_loop_begin cold_loop_begin: br i1 %cond, label %cold_loop_a, label %cold_loop_b @@ -110,7 +48,7 @@ cold_loop_exit: } !llvm.module.flags = !{!1} -!0 = !{!"function_entry_count", i64 400} +!0 = !{!"function_entry_count", i64 0} !1 = !{i32 1, !"ProfileSummary", !2} !2 = !{!3, !4, !5, !6, !7, !8, !9, !10} !3 = !{!"ProfileFormat", !"InstrProf"} @@ -125,5 +63,3 @@ cold_loop_exit: !12 = !{i32 10000, i64 100, i32 1} !13 = !{i32 999000, i64 100, i32 1} !14 = !{i32 999999, i64 1, i32 2} -!15 = 
!{!"branch_weights", i32 100, i32 0} -!16 = !{!"branch_weights", i32 0, i32 100} diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch2.ll b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch2.ll index cc3df2faaa904..452b4d876d937 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch2.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/PGO-nontrivial-unswitch2.ll @@ -8,25 +8,38 @@ declare i32 @b() ; Check loops will be applied non-trivial loop unswitch in a non-cold function, ; even loop headers are cold -define void @f1(i32 %i, i1 %cond, i1 %hot_cond, i1 %cold_cond, i1* %ptr) !prof !0 { +define void @f1(i32 %i, i1 %cond, i1 %hot_cond, i1 %cold_cond, i1* %ptr) !prof !14 { ; CHECK-LABEL: @f1( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[ENTRY_COLD_LOOP:%.*]] ; CHECK: entry_cold_loop: ; CHECK-NEXT: br i1 [[COLD_COND:%.*]], label [[COLD_LOOP_BEGIN_PREHEADER:%.*]], label [[COLD_LOOP_EXIT:%.*]], !prof [[PROF15:![0-9]+]] ; CHECK: cold_loop_begin.preheader: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[COLD_LOOP_BEGIN_PREHEADER_SPLIT_US:%.*]], label [[COLD_LOOP_BEGIN_PREHEADER_SPLIT:%.*]] +; CHECK: cold_loop_begin.preheader.split.us: +; CHECK-NEXT: br label [[COLD_LOOP_BEGIN_US:%.*]] +; CHECK: cold_loop_begin.us: +; CHECK-NEXT: br label [[COLD_LOOP_A_US:%.*]] +; CHECK: cold_loop_a.us: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @a() +; CHECK-NEXT: br label [[COLD_LOOP_LATCH_US:%.*]] +; CHECK: cold_loop_latch.us: +; CHECK-NEXT: [[V2_US:%.*]] = load i1, i1* [[PTR:%.*]], align 1 +; CHECK-NEXT: br i1 [[V2_US]], label [[COLD_LOOP_BEGIN_US]], label [[COLD_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]] +; CHECK: cold_loop_exit.loopexit.split.us: +; CHECK-NEXT: br label [[COLD_LOOP_EXIT_LOOPEXIT:%.*]] +; CHECK: cold_loop_begin.preheader.split: ; CHECK-NEXT: br label [[COLD_LOOP_BEGIN:%.*]] ; CHECK: cold_loop_begin: -; CHECK-NEXT: br i1 [[COND:%.*]], label [[COLD_LOOP_A:%.*]], label [[COLD_LOOP_B:%.*]] -; CHECK: cold_loop_a: -; CHECK-NEXT: 
[[TMP0:%.*]] = call i32 @a() -; CHECK-NEXT: br label [[COLD_LOOP_LATCH:%.*]] +; CHECK-NEXT: br label [[COLD_LOOP_B:%.*]] ; CHECK: cold_loop_b: ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @b() -; CHECK-NEXT: br label [[COLD_LOOP_LATCH]] +; CHECK-NEXT: br label [[COLD_LOOP_LATCH:%.*]] ; CHECK: cold_loop_latch: -; CHECK-NEXT: [[V2:%.*]] = load i1, i1* [[PTR:%.*]], align 1 -; CHECK-NEXT: br i1 [[V2]], label [[COLD_LOOP_BEGIN]], label [[COLD_LOOP_EXIT_LOOPEXIT:%.*]] +; CHECK-NEXT: [[V2:%.*]] = load i1, i1* [[PTR]], align 1 +; CHECK-NEXT: br i1 [[V2]], label [[COLD_LOOP_BEGIN]], label [[COLD_LOOP_EXIT_LOOPEXIT_SPLIT:%.*]] +; CHECK: cold_loop_exit.loopexit.split: +; CHECK-NEXT: br label [[COLD_LOOP_EXIT_LOOPEXIT]] ; CHECK: cold_loop_exit.loopexit: ; CHECK-NEXT: br label [[COLD_LOOP_EXIT]] ; CHECK: cold_loop_exit: @@ -36,17 +49,17 @@ entry: br label %entry_cold_loop entry_cold_loop: - br i1 %cold_cond, label %cold_loop_begin, label %cold_loop_exit, !prof !16 + br i1 %cold_cond, label %cold_loop_begin, label %cold_loop_exit, !prof !15 cold_loop_begin: br i1 %cond, label %cold_loop_a, label %cold_loop_b cold_loop_a: - call i32 @a() + %0 = call i32 @a() br label %cold_loop_latch cold_loop_b: - call i32 @b() + %1 = call i32 @b() br label %cold_loop_latch cold_loop_latch: @@ -57,21 +70,21 @@ cold_loop_exit: ret void } -!llvm.module.flags = !{!1} -!0 = !{!"function_entry_count", i64 400} -!1 = !{i32 1, !"ProfileSummary", !2} -!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} -!3 = !{!"ProfileFormat", !"InstrProf"} -!4 = !{!"TotalCount", i64 10000} -!5 = !{!"MaxCount", i64 10} -!6 = !{!"MaxInternalCount", i64 1} -!7 = !{!"MaxFunctionCount", i64 1000} -!8 = !{!"NumCounts", i64 3} -!9 = !{!"NumFunctions", i64 3} -!10 = !{!"DetailedSummary", !11} -!11 = !{!12, !13, !14} -!12 = !{i32 10000, i64 100, i32 1} -!13 = !{i32 999000, i64 100, i32 1} -!14 = !{i32 999999, i64 1, i32 2} -!15 = !{!"branch_weights", i32 100, i32 0} -!16 = !{!"branch_weights", i32 0, i32 100} +!llvm.module.flags = !{!0} + +!0 = 
!{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 400} +!15 = !{!"branch_weights", i32 0, i32 100}