diff --git a/bolt/include/bolt/Passes/InferNonStale.h b/bolt/include/bolt/Passes/InferNonStale.h
new file mode 100644
index 0000000000000..16e7aecbd6eb9
--- /dev/null
+++ b/bolt/include/bolt/Passes/InferNonStale.h
@@ -0,0 +1,40 @@
+//===- bolt/Passes/InferNonStale.h - Non-stale inference --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass that runs stale profile matching on functions
+// with non-stale/non-inferred profile to improve profile quality.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BOLT_PASSES_INFERNONSTALE_H
+#define BOLT_PASSES_INFERNONSTALE_H
+
+#include "bolt/Passes/BinaryPasses.h"
+
+namespace llvm {
+namespace bolt {
+
+/// Run stale profile matching inference on functions with non-stale profile
+/// to improve edge count estimates and profile quality.
+class InferNonStale : public BinaryFunctionPass {
+  void runOnFunction(BinaryFunction &BF);
+
+public:
+  explicit InferNonStale(const cl::opt<bool> &PrintPass)
+      : BinaryFunctionPass(PrintPass) {}
+
+  const char *getName() const override { return "infer-non-stale"; }
+
+  /// Pass entry point: applies runOnFunction to the functions it selects.
+  Error runOnFunctions(BinaryContext &BC) override;
+};
+
+} // namespace bolt
+} // namespace llvm
+
+#endif
diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt
index 77d2bb9c2bcb5..9f36d3e02afb5 100644
--- a/bolt/lib/Passes/CMakeLists.txt
+++ b/bolt/lib/Passes/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_library(LLVMBOLTPasses
   Hugify.cpp
   IdenticalCodeFolding.cpp
   IndirectCallPromotion.cpp
+  InferNonStale.cpp
   Inliner.cpp
   Instrumentation.cpp
   JTFootprintReduction.cpp
@@ -64,5 +65,6 @@ add_llvm_library(LLVMBOLTPasses
 target_link_libraries(LLVMBOLTPasses
   PRIVATE
   LLVMBOLTCore
+  LLVMBOLTProfile
   LLVMBOLTUtils
   )
diff --git a/bolt/lib/Passes/InferNonStale.cpp b/bolt/lib/Passes/InferNonStale.cpp
new file mode 100644
index 0000000000000..f7d1944ecb0c4
--- /dev/null
+++ b/bolt/lib/Passes/InferNonStale.cpp
@@ -0,0 +1,168 @@
+//===- bolt/Passes/InferNonStale.cpp - Non-stale profile inference ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the InferNonStale pass that runs stale profile
+// matching on functions with non-stale/non-inferred profile to improve
+// profile quality.
+//
+//===----------------------------------------------------------------------===//
+
+#include "bolt/Passes/InferNonStale.h"
+
+#include "bolt/Core/BinaryFunction.h"
+#include "bolt/Core/ParallelUtilities.h"
+#include "bolt/Utils/CommandLineOpts.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Transforms/Utils/SampleProfileInference.h"
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "infer-non-stale"
+
+using namespace llvm;
+using namespace bolt;
+
+namespace opts {
+
+extern cl::opt<bool> TimeRewrite;
+extern cl::OptionCategory BoltOptCategory;
+
+cl::opt<bool>
+    InferNonStaleProfile("infer-non-stale-profile",
+                         cl::desc("Infer profile counts for functions with "
+                                  "non-stale profile using profi"),
+                         cl::init(false), cl::cat(BoltOptCategory));
+
+// Reuse existing stale matching parameters
+extern cl::opt<bool> StaleMatchingEvenFlowDistribution;
+extern cl::opt<bool> StaleMatchingRebalanceUnknown;
+extern cl::opt<bool> StaleMatchingJoinIslands;
+extern cl::opt<unsigned> StaleMatchingCostBlockInc;
+extern cl::opt<unsigned> StaleMatchingCostBlockDec;
+extern cl::opt<unsigned> StaleMatchingCostJumpInc;
+extern cl::opt<unsigned> StaleMatchingCostJumpDec;
+extern cl::opt<unsigned> StaleMatchingCostBlockUnknownInc;
+extern cl::opt<unsigned> StaleMatchingCostJumpUnknownInc;
+extern cl::opt<unsigned> StaleMatchingCostJumpUnknownFTInc;
+
+} // namespace opts
+
+namespace llvm {
+namespace bolt {
+
+// Forward declarations of functions from StaleProfileMatching.cpp
+FlowFunction
+createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder);
+void preprocessUnreachableBlocks(FlowFunction &Func);
+void assignProfile(BinaryFunction &BF,
+                   const BinaryFunction::BasicBlockOrderType &BlockOrder,
+                   FlowFunction &Func);
+
+/// Seed a flow function with \p BF's existing block and branch counts, run
+/// the profile inference algorithm on it, and update \p BF from the result.
+/// \p BF must have a CFG and a valid profile that is not already inferred.
+void InferNonStale::runOnFunction(BinaryFunction &BF) {
+  // Only process functions with a CFG and a valid, non-inferred profile;
+  // runOnFunctions is expected to have filtered out all other functions.
+  assert(BF.hasCFG() && "Function must have CFG");
+  assert(BF.hasValidProfile() && "Function must have valid profile");
+  assert(!BF.hasInferredProfile() && "Function must not have inferred profile");
+
+  LLVM_DEBUG(dbgs() << "BOLT-INFO: applying non-stale profile inference for "
+                    << "\"" << BF.getPrintName() << "\"\n");
+
+  // Make sure that block hashes are up to date.
+  BF.computeBlockHashes();
+
+  const BinaryFunction::BasicBlockOrderType BlockOrder(
+      BF.getLayout().block_begin(), BF.getLayout().block_end());
+
+  // Create a wrapper flow function to use with the profile inference algorithm.
+  FlowFunction Func = createFlowFunction(BlockOrder);
+
+  // Seed the flow function with the existing profile counts. Unlike stale
+  // matching, the counts are taken directly from the function's profile.
+  for (size_t I = 0, E = BlockOrder.size(); I < E; ++I) {
+    BinaryBasicBlock *BB = BlockOrder[I];
+    FlowBlock &Block = Func.Blocks[I + 1]; // Skip dummy entry block
+
+    // Set block weight from existing execution count
+    Block.Weight = BB->getKnownExecutionCount();
+    Block.HasUnknownWeight = (Block.Weight == 0);
+
+    // Set jump weights from existing branch info
+    for (FlowJump *Jump : Block.SuccJumps) {
+      if (Jump->Target == Func.Blocks.size() - 1) // Skip artificial sink
+        continue;
+
+      BinaryBasicBlock *SuccBB = BlockOrder[Jump->Target - 1];
+      if (BB->getSuccessor(SuccBB->getLabel())) {
+        BinaryBasicBlock::BinaryBranchInfo &BI = BB->getBranchInfo(*SuccBB);
+        Jump->Weight = BI.Count;
+        Jump->HasUnknownWeight = (Jump->Weight == 0);
+      }
+    }
+  }
+
+  // Adjust the flow function by marking unreachable blocks Unlikely
+  preprocessUnreachableBlocks(Func);
+
+  // Set up inference parameters
+  ProfiParams Params;
+  Params.EvenFlowDistribution = opts::StaleMatchingEvenFlowDistribution;
+  Params.RebalanceUnknown = opts::StaleMatchingRebalanceUnknown;
+  Params.JoinIslands = opts::StaleMatchingJoinIslands;
+
+  Params.CostBlockInc = opts::StaleMatchingCostBlockInc;
+  Params.CostBlockEntryInc = opts::StaleMatchingCostBlockInc;
+  Params.CostBlockDec = opts::StaleMatchingCostBlockDec;
+  Params.CostBlockEntryDec = opts::StaleMatchingCostBlockDec;
+  Params.CostBlockUnknownInc = opts::StaleMatchingCostBlockUnknownInc;
+
+  Params.CostJumpInc = opts::StaleMatchingCostJumpInc;
+  Params.CostJumpFTInc = opts::StaleMatchingCostJumpInc;
+  Params.CostJumpDec = opts::StaleMatchingCostJumpDec;
+  Params.CostJumpFTDec = opts::StaleMatchingCostJumpDec;
+  Params.CostJumpUnknownInc = opts::StaleMatchingCostJumpUnknownInc;
+  Params.CostJumpUnknownFTInc = opts::StaleMatchingCostJumpUnknownFTInc;
+
+  // Apply the profile inference algorithm
+  applyFlowInference(Params, Func);
+
+  // Collect inferred counts and update function annotations
+  assignProfile(BF, BlockOrder, Func);
+
+  // Mark the function as having inferred profile
+  BF.setHasInferredProfile(true);
+}
+
+/// Pass entry point: run runOnFunction, possibly in parallel, on every
+/// function that has a CFG and a valid profile that was not already inferred.
+Error InferNonStale::runOnFunctions(BinaryContext &BC) {
+  // Time the pass as a whole. runOnFunction may execute concurrently on worker
+  // threads, and a named Timer must not be started while it is running.
+  NamedRegionTimer T("inferNonStale", "non-stale profile inference", "rewrite",
+                     "Rewrite passes", opts::TimeRewrite);
+
+  ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
+    runOnFunction(BF);
+  };
+
+  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
+    return !BF.hasValidProfile() || BF.hasInferredProfile() || !BF.hasCFG();
+  };
+
+  ParallelUtilities::runOnEachFunction(
+      BC, ParallelUtilities::SchedulingPolicy::SP_BB_QUADRATIC, WorkFun,
+      SkipFunc, "InferNonStale");
+
+  return Error::success();
+}
+
+} // namespace bolt
+} // namespace llvm
diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp index 1a61949d77472..b66a3f478f1a7 100644 --- a/bolt/lib/Profile/StaleProfileMatching.cpp +++ b/bolt/lib/Profile/StaleProfileMatching.cpp @@ -52,66 +52,66 @@ cl::opt cl::desc("Infer counts from stale profile data."), cl::init(false), cl::Hidden, cl::cat(BoltOptCategory)); -static cl::opt StaleMatchingMinMatchedBlock( +cl::opt StaleMatchingMinMatchedBlock( "stale-matching-min-matched-block", cl::desc("Percentage threshold of matched basic blocks at which stale " "profile 
inference is executed."), cl::init(0), cl::Hidden, cl::cat(BoltOptCategory)); -static cl::opt StaleMatchingMaxFuncSize( +cl::opt StaleMatchingMaxFuncSize( "stale-matching-max-func-size", cl::desc("The maximum size of a function to consider for inference."), cl::init(10000), cl::Hidden, cl::cat(BoltOptCategory)); // Parameters of the profile inference algorithm. The default values are tuned // on several benchmarks. -static cl::opt StaleMatchingEvenFlowDistribution( +cl::opt StaleMatchingEvenFlowDistribution( "stale-matching-even-flow-distribution", cl::desc("Try to evenly distribute flow when there are multiple equally " "likely options."), cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory)); -static cl::opt StaleMatchingRebalanceUnknown( +cl::opt StaleMatchingRebalanceUnknown( "stale-matching-rebalance-unknown", cl::desc("Evenly re-distribute flow among unknown subgraphs."), cl::init(false), cl::ReallyHidden, cl::cat(BoltOptCategory)); -static cl::opt StaleMatchingJoinIslands( +cl::opt StaleMatchingJoinIslands( "stale-matching-join-islands", cl::desc("Join isolated components having positive flow."), cl::init(true), cl::ReallyHidden, cl::cat(BoltOptCategory)); -static cl::opt StaleMatchingCostBlockInc( +cl::opt StaleMatchingCostBlockInc( "stale-matching-cost-block-inc", cl::desc("The cost of increasing a block count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -static cl::opt StaleMatchingCostBlockDec( +cl::opt StaleMatchingCostBlockDec( "stale-matching-cost-block-dec", cl::desc("The cost of decreasing a block count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -static cl::opt StaleMatchingCostJumpInc( +cl::opt StaleMatchingCostJumpInc( "stale-matching-cost-jump-inc", cl::desc("The cost of increasing a jump count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -static cl::opt StaleMatchingCostJumpDec( +cl::opt StaleMatchingCostJumpDec( "stale-matching-cost-jump-dec", cl::desc("The 
cost of decreasing a jump count by one."), cl::init(150), cl::ReallyHidden, cl::cat(BoltOptCategory)); -static cl::opt StaleMatchingCostBlockUnknownInc( +cl::opt StaleMatchingCostBlockUnknownInc( "stale-matching-cost-block-unknown-inc", cl::desc("The cost of increasing an unknown block count by one."), cl::init(1), cl::ReallyHidden, cl::cat(BoltOptCategory)); -static cl::opt StaleMatchingCostJumpUnknownInc( +cl::opt StaleMatchingCostJumpUnknownInc( "stale-matching-cost-jump-unknown-inc", cl::desc("The cost of increasing an unknown jump count by one."), cl::init(140), cl::ReallyHidden, cl::cat(BoltOptCategory)); -static cl::opt StaleMatchingCostJumpUnknownFTInc( +cl::opt StaleMatchingCostJumpUnknownFTInc( "stale-matching-cost-jump-unknown-ft-inc", cl::desc( "The cost of increasing an unknown fall-through jump count by one."), diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp index d9b7a2bd9a14c..2aa7a8d90cdc5 100644 --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -18,6 +18,7 @@ #include "bolt/Passes/Hugify.h" #include "bolt/Passes/IdenticalCodeFolding.h" #include "bolt/Passes/IndirectCallPromotion.h" +#include "bolt/Passes/InferNonStale.h" #include "bolt/Passes/Inliner.h" #include "bolt/Passes/Instrumentation.h" #include "bolt/Passes/JTFootprintReduction.h" @@ -58,6 +59,7 @@ extern cl::opt PLT; extern cl::opt ICF; +extern cl::opt InferNonStaleProfile; static cl::opt DynoStatsAll("dyno-stats-all", @@ -98,6 +100,11 @@ static cl::opt PrintEstimateEdgeCounts( cl::desc("print function after edge counts are set for no-LBR profile"), cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt PrintInferNonStale( + "print-infer-non-stale", + cl::desc("print function after non-stale profile inference"), cl::Hidden, + cl::cat(BoltOptCategory)); + cl::opt PrintFinalized("print-finalized", cl::desc("print function after CFG is finalized"), @@ -384,6 +391,15 @@ Error 
BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { Manager.registerPass(std::make_unique(NeverPrint)); + // Optionally run profile inference on non-stale profiles + if (opts::InferNonStaleProfile) { + Manager.registerPass(std::make_unique(PrintInferNonStale)); + + // Print profile quality stats after inference to show improvement + Manager.registerPass( + std::make_unique(NeverPrint)); + } + Manager.registerPass(std::make_unique(NeverPrint)); Manager.registerPass(std::make_unique(NeverPrint)); diff --git a/bolt/test/X86/profile-quality-reporting.test b/bolt/test/X86/profile-quality-reporting.test index 210d3e10a3890..9908f1e2fe5ce 100644 --- a/bolt/test/X86/profile-quality-reporting.test +++ b/bolt/test/X86/profile-quality-reporting.test @@ -2,3 +2,8 @@ RUN: yaml2obj %p/Inputs/blarge_new.yaml &> %t.exe RUN: llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt | FileCheck %s CHECK: profile quality metrics for the hottest 5 functions (reporting top 5% values): function CFG discontinuity 100.00%; call graph flow conservation gap 60.00%; CFG flow conservation gap 45.53% (weighted) 96.87% (worst); exception handling usage 0.00% (of total BBEC) 0.00% (of total InvokeEC) + +## Check profile quality with infer-non-stale-profile option +RUN: llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt \ +RUN: --infer-non-stale-profile | FileCheck %s --check-prefix CHECK-INFER +CHECK-INFER: profile quality metrics for the hottest 5 functions (reporting top 5% values): function CFG discontinuity 100.00%; call graph flow conservation gap 60.00%; CFG flow conservation gap 45.53% (weighted) 96.87% (worst); exception handling usage 0.00% (of total BBEC) 0.00% (of total InvokeEC)