diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 285645b8ba47f4..022861318ce95c 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -675,8 +675,12 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); + // Don't add CHR pass for CSIRInstr build in PostLink as the profile + // is still the same as the PreLink compilation. if (EnableCHR && Level == OptimizationLevel::O3 && PGOOpt && - (PGOOpt->Action == PGOOptions::IRUse || + ((PGOOpt->Action == PGOOptions::IRUse && + (Phase != ThinOrFullLTOPhase::ThinLTOPostLink || + PGOOpt->CSAction != PGOOptions::CSIRInstr)) || PGOOpt->Action == PGOOptions::SampleUse)) FPM.addPass(ControlHeightReductionPass()); diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp index 8ecf2cfdcbe137..aee9718ff0c780 100644 --- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp +++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp @@ -47,6 +47,9 @@ using namespace llvm; #define CHR_DEBUG(X) LLVM_DEBUG(X) +static cl::opt<bool> DisableCHR("disable-chr", cl::init(false), cl::Hidden, + cl::desc("Disable CHR for all functions")); + static cl::opt<bool> ForceCHR("force-chr", cl::init(false), cl::Hidden, cl::desc("Apply CHR for all functions")); @@ -66,6 +69,10 @@ static cl::opt<std::string> CHRFunctionList( "chr-function-list", cl::init(""), cl::Hidden, cl::desc("Specify file to retrieve the list of functions to apply CHR to")); +static cl::opt<unsigned> CHRDupThreshsold( + "chr-dup-threshold", cl::init(3), cl::Hidden, + cl::desc("Max number of duplications by CHR for a region")); + static StringSet<> CHRModules; static StringSet<> CHRFunctions; @@ -339,23 +346,27 @@ class CHR { BasicBlock *EntryBlock, BasicBlock *NewEntryBlock, ValueToValueMapTy &VMap); - void 
fixupBranchesAndSelects(CHRScope *Scope, - BasicBlock *PreEntryBlock, - BranchInst *MergedBR, - uint64_t ProfileCount); - void fixupBranch(Region *R, - CHRScope *Scope, - IRBuilder<> &IRB, + void fixupBranchesAndSelects(CHRScope *Scope, BasicBlock *PreEntryBlock, + BranchInst *MergedBR, uint64_t ProfileCount); + void fixupBranch(Region *R, CHRScope *Scope, IRBuilder<> &IRB, Value *&MergedCondition, BranchProbability &CHRBranchBias); - void fixupSelect(SelectInst* SI, - CHRScope *Scope, - IRBuilder<> &IRB, + void fixupSelect(SelectInst *SI, CHRScope *Scope, IRBuilder<> &IRB, Value *&MergedCondition, BranchProbability &CHRBranchBias); void addToMergedCondition(bool IsTrueBiased, Value *Cond, - Instruction *BranchOrSelect, - CHRScope *Scope, - IRBuilder<> &IRB, - Value *&MergedCondition); + Instruction *BranchOrSelect, CHRScope *Scope, + IRBuilder<> &IRB, Value *&MergedCondition); + unsigned getRegionDuplicationCount(const Region *R) { + unsigned Count = 0; + // Find out how many times region R is cloned. Note that if the parent + // of R is cloned, R is also cloned, but R's clone count is not updated + // from the clone of the parent. We need to accumulate all the counts + // from the ancestors to get the clone count. + while (R) { + Count += DuplicationCount.lookup(R); + R = R->getParent(); + } + return Count; + } Function &F; BlockFrequencyInfo &BFI; @@ -379,6 +390,8 @@ class CHR { DenseMap<SelectInst *, BranchProbability> SelectBiasMap; // All the scopes. DenseSet<CHRScope *> Scopes; + // This map records how many times this region is cloned. 
+ DenseMap<const Region *, unsigned> DuplicationCount; }; } // end anonymous namespace @@ -396,7 +409,10 @@ raw_ostream &operator<<(raw_ostream &OS, const CHRScope &Scope) { return OS; } -static bool shouldApply(Function &F, ProfileSummaryInfo& PSI) { +static bool shouldApply(Function &F, ProfileSummaryInfo &PSI) { + if (DisableCHR) + return false; + if (ForceCHR) return true; @@ -1666,6 +1682,27 @@ void CHR::transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs) { CHR_DEBUG(dbgs() << "transformScopes " << *Scope << "\n"); assert(Scope->RegInfos.size() >= 1 && "Should have at least one Region"); + + for (RegInfo &RI : Scope->RegInfos) { + const Region *R = RI.R; + unsigned Duplication = getRegionDuplicationCount(R); + CHR_DEBUG(dbgs() << "Dup count for R=" << R << " is " << Duplication + << "\n"); + if (Duplication >= CHRDupThreshsold) { + CHR_DEBUG(dbgs() << "Reached the dup threshold of " << Duplication + << " for this region"); + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "DupThresholdReached", + R->getEntry()->getTerminator()) + << "Reached the duplication threshold for the region"; + }); + return; + } + } + for (RegInfo &RI : Scope->RegInfos) { + DuplicationCount[RI.R]++; + } + Region *FirstRegion = Scope->RegInfos[0].R; BasicBlock *EntryBlock = FirstRegion->getEntry(); Region *LastRegion = Scope->RegInfos[Scope->RegInfos.size() - 1].R; diff --git a/llvm/test/Transforms/PGOProfile/chr-dup-threshold.ll b/llvm/test/Transforms/PGOProfile/chr-dup-threshold.ll new file mode 100644 index 00000000000000..d3c3cb2e7ca702 --- /dev/null +++ b/llvm/test/Transforms/PGOProfile/chr-dup-threshold.ll @@ -0,0 +1,195 @@ +; Test case for capping the cloning in CHR. +; RUN: opt < %s -passes='require<profile-summary>,function(chr)' -chr-dup-threshold=2 -S | FileCheck %s + +; c sources for the test case. 
+; extern void foo(int); +; __attribute__((noinline)) void goo(int r, int s, int t) { +; if ((r & 2) != 0) { +; if ((s & 2) != 0) { +; if ((t & 2) != 0) { +; foo(111); +; } +; if ((t & 4) != 0) { +; foo(112); +; } +; } +; if ((s & 4) != 0) { +; if ((t & 2) != 0) { +; foo(121); +; } +; if ((t & 4) != 0) { +; foo(122); +; } +; } +; } +; if ((r & 4) != 0) { +; if ((s & 2) != 0) { +; if ((t & 2) != 0) { +; foo(211); +; } +; if ((t & 4) != 0) { +; foo(212); +; } +; } +; if ((s & 4) != 0) { +; if ((t & 2) != 0) { +; foo(221); +; } +; if ((t & 4) != 0) { +; foo(222); +; } +; } +; } +; } +; +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local void @goo(i32 noundef %r, i32 noundef %s, i32 noundef %t) !prof !34 { +entry: + %and = and i32 %r, 2 + %cmp.not = icmp eq i32 %and, 0 + br i1 %cmp.not, label %if.end24, label %if.then, !prof !35 + +if.then: + %and1 = and i32 %s, 2 + %cmp2.not = icmp eq i32 %and1, 0 + br i1 %cmp2.not, label %if.end11, label %if.then3, !prof !35 + +if.then3: + %and4 = and i32 %t, 2 + %cmp5.not = icmp eq i32 %and4, 0 + br i1 %cmp5.not, label %if.end, label %if.then6, !prof !35 + +if.then6: + tail call void @foo(i32 noundef 111) + br label %if.end + +if.end: + %and7 = and i32 %t, 4 + %cmp8.not = icmp eq i32 %and7, 0 + br i1 %cmp8.not, label %if.end11, label %if.then9, !prof !35 + +if.then9: + tail call void @foo(i32 noundef 112) + br label %if.end11 + +if.end11: + %and12 = and i32 %s, 4 + %cmp13.not = icmp eq i32 %and12, 0 + br i1 %cmp13.not, label %if.end24, label %if.then14, !prof !35 + +if.then14: + %and15 = and i32 %t, 2 + %cmp16.not = icmp eq i32 %and15, 0 + br i1 %cmp16.not, label %if.end18, label %if.then17, !prof !35 + +if.then17: + tail call void @foo(i32 noundef 121) + br label %if.end18 + +if.end18: + %and19 = and i32 %t, 4 + %cmp20.not = icmp eq i32 %and19, 0 + br i1 %cmp20.not, label %if.end24, label %if.then21, !prof !35 + +if.then21: + 
tail call void @foo(i32 noundef 122) + br label %if.end24 + +if.end24: + %and25 = and i32 %r, 4 + %cmp26.not = icmp eq i32 %and25, 0 + br i1 %cmp26.not, label %if.end52, label %if.then27, !prof !35 + +if.then27: + %and28 = and i32 %s, 2 + %cmp29.not = icmp eq i32 %and28, 0 + br i1 %cmp29.not, label %if.end39, label %if.then30, !prof !35 + +if.then30: + %and31 = and i32 %t, 2 + %cmp32.not = icmp eq i32 %and31, 0 + br i1 %cmp32.not, label %if.end34, label %if.then33, !prof !35 + +if.then33: + tail call void @foo(i32 noundef 211) + br label %if.end34 + +if.end34: + %and35 = and i32 %t, 4 + %cmp36.not = icmp eq i32 %and35, 0 + br i1 %cmp36.not, label %if.end39, label %if.then37, !prof !35 + +if.then37: + tail call void @foo(i32 noundef 212) + br label %if.end39 + +if.end39: + %and40 = and i32 %s, 4 + %cmp41.not = icmp eq i32 %and40, 0 + br i1 %cmp41.not, label %if.end52, label %if.then42, !prof !35 + +if.then42: + %and43 = and i32 %t, 2 + %cmp44.not = icmp eq i32 %and43, 0 + br i1 %cmp44.not, label %if.end46, label %if.then45, !prof !35 + +if.then45: + tail call void @foo(i32 noundef 221) + br label %if.end46 + +if.end46: + %and47 = and i32 %t, 4 + %cmp48.not = icmp eq i32 %and47, 0 + br i1 %cmp48.not, label %if.end52, label %if.then49, !prof !35 + +if.then49: + tail call void @foo(i32 noundef 222) + br label %if.end52 + +if.end52: + ret void +} + +; CHECK-LABEL: goo +; CHECK-COUNT-3: {{.*}}.split: +; CHECK-NOT: {{.*}}.split: + +declare void @foo(i32 noundef) + +!llvm.module.flags = !{!4} + +!4 = !{i32 1, !"ProfileSummary", !5} +!5 = !{!6, !7, !8, !9, !10, !11, !12, !13, !14, !15} +!6 = !{!"ProfileFormat", !"InstrProf"} +!7 = !{!"TotalCount", i64 2400001} +!8 = !{!"MaxCount", i64 800000} +!9 = !{!"MaxInternalCount", i64 100000} +!10 = !{!"MaxFunctionCount", i64 800000} +!11 = !{!"NumCounts", i64 19} +!12 = !{!"NumFunctions", i64 4} +!13 = !{!"IsPartialProfile", i64 0} +!14 = !{!"PartialProfileRatio", double 0.000000e+00} +!15 = !{!"DetailedSummary", !16} +!16 = !{!17, 
!18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32} +!17 = !{i32 10000, i64 800000, i32 1} +!18 = !{i32 100000, i64 800000, i32 1} +!19 = !{i32 200000, i64 800000, i32 1} +!20 = !{i32 300000, i64 800000, i32 1} +!21 = !{i32 400000, i64 100000, i32 17} +!22 = !{i32 500000, i64 100000, i32 17} +!23 = !{i32 600000, i64 100000, i32 17} +!24 = !{i32 700000, i64 100000, i32 17} +!25 = !{i32 800000, i64 100000, i32 17} +!26 = !{i32 900000, i64 100000, i32 17} +!27 = !{i32 950000, i64 100000, i32 17} +!28 = !{i32 990000, i64 100000, i32 17} +!29 = !{i32 999000, i64 100000, i32 17} +!30 = !{i32 999900, i64 100000, i32 17} +!31 = !{i32 999990, i64 100000, i32 17} +!32 = !{i32 999999, i64 100000, i32 17} +!34 = !{!"function_entry_count", i64 100000} +!35 = !{!"branch_weights", i32 0, i32 100000} +!36 = !{!"function_entry_count", i64 1} +!37 = !{!"branch_weights", i32 100000, i32 1}