// Patch summary: adds profile density reporting to BOLT. DataAggregator::doTrace accumulates SampleCountInBytes on the BAT parent function (or on FromFunc itself) as Count * (Second.From - First.To). When opts::ShowDensity is set, PrintProgramStats::runOnFunctions computes a per-function density (SampleCountInBytes / function size), walks functions in descending density order until the --profile-density-cutoff-hot share of TotalSampleCount is reached, and warns if the resulting density is 0 or below --profile-density-threshold. perf2boltMode sets ShowDensity to true.
// NOTE(review): this copy of the patch appears to have its newlines collapsed and its template arguments / #include targets stripped (e.g. "cl::opt ProfileDensityCutOffHot", "#include #include") - confirm against the original commit before applying.
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index fc0375b75533f..0b875ed688101 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -386,6 +386,9 @@ class BinaryFunction { /// Raw branch count for this function in the profile. uint64_t RawBranchCount{0}; + /// Executed bytes summed over traces (count * length); used for density. + uint64_t SampleCountInBytes{0}; + /// Indicates the type of profile the function is using. uint16_t ProfileFlags{PF_NONE}; @@ -1844,6 +1847,9 @@ class BinaryFunction { /// to this function. void setRawBranchCount(uint64_t Count) { RawBranchCount = Count; } + /// Return the number of dynamically executed bytes, from raw perf data. + uint64_t getSampleCountInBytes() const { return SampleCountInBytes; } + /// Return the execution count for functions with known profile. /// Return 0 if the function has no profile. uint64_t getKnownExecutionCount() const { diff --git a/bolt/include/bolt/Utils/CommandLineOpts.h b/bolt/include/bolt/Utils/CommandLineOpts.h index baabeab577fb5..04bf7db5de952 100644 --- a/bolt/include/bolt/Utils/CommandLineOpts.h +++ b/bolt/include/bolt/Utils/CommandLineOpts.h @@ -55,6 +55,7 @@ extern llvm::cl::opt PrintSections; enum ProfileFormatKind { PF_Fdata, PF_YAML }; extern llvm::cl::opt ProfileFormat; +extern llvm::cl::opt ShowDensity; extern llvm::cl::opt SplitEH; extern llvm::cl::opt StrictMode; extern llvm::cl::opt TimeOpts; diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index fa95ad7324ac1..5a676185227ec 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -15,6 +15,7 @@ #include "bolt/Core/ParallelUtilities.h" #include "bolt/Passes/ReorderAlgorithm.h" #include "bolt/Passes/ReorderFunctions.h" +#include "bolt/Utils/CommandLineOpts.h" #include "llvm/Support/CommandLine.h" #include #include @@ -223,6 +224,18 @@ static cl::opt TopCalledLimit( "functions 
section"), cl::init(100), cl::Hidden, cl::cat(BoltCategory)); +// Profile density options, synced with llvm-profgen/ProfileGenerator.cpp +static cl::opt ProfileDensityCutOffHot( + "profile-density-cutoff-hot", cl::init(990000), + cl::desc("Total samples cutoff for functions used to calculate " + "profile density.")); + +static cl::opt ProfileDensityThreshold( + "profile-density-threshold", cl::init(60), + cl::desc("If the profile density is below the given threshold, it " + "will be suggested to increase the sampling rate."), + cl::Optional); + } // namespace opts namespace llvm { @@ -1383,6 +1396,7 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { uint64_t StaleSampleCount = 0; uint64_t InferredSampleCount = 0; std::vector ProfiledFunctions; + std::vector> FuncDensityList; const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n"; for (auto &BFI : BC.getBinaryFunctions()) { const BinaryFunction &Function = BFI.second; @@ -1441,6 +1455,22 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { StaleSampleCount += SampleCount; ++NumAllStaleFunctions; } + + if (opts::ShowDensity) { + uint64_t Size = Function.getSize(); + // In case of BOLT split functions registered in BAT, executed traces are + // automatically attributed to the main fragment. Add up function sizes + // for all fragments. 
+ if (IsHotParentOfBOLTSplitFunction) + for (const BinaryFunction *Fragment : Function.getFragments()) + Size += Fragment->getSize(); + double Density = (double)1.0 * Function.getSampleCountInBytes() / Size; + FuncDensityList.emplace_back(Density, SampleCount); + LLVM_DEBUG(BC.outs() << Function << ": executed bytes " + << Function.getSampleCountInBytes() << ", size (b) " + << Size << ", density " << Density + << ", sample count " << SampleCount << '\n'); + } } BC.NumProfiledFuncs = ProfiledFunctions.size(); BC.NumStaleProfileFuncs = NumStaleProfileFunctions; @@ -1684,6 +1714,50 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) { BC.outs() << ". Use -print-unknown to see the list."; BC.outs() << '\n'; } + + if (opts::ShowDensity) { + double Density = 0.0; + // Sort by density, descending; ties broken by ascending sample count. + llvm::stable_sort(FuncDensityList, + [&](const std::pair &A, + const std::pair &B) { + if (A.first != B.first) + return A.first > B.first; + return A.second < B.second; + }); + + uint64_t AccumulatedSamples = 0; + uint32_t I = 0; + assert(opts::ProfileDensityCutOffHot <= 1000000 && + "The cutoff value is greater than 1000000(100%)"); + while (AccumulatedSamples < + TotalSampleCount * + static_cast(opts::ProfileDensityCutOffHot) / + 1000000 && + I < FuncDensityList.size()) { + AccumulatedSamples += FuncDensityList[I].second; + Density = FuncDensityList[I].first; + I++; + } + if (Density == 0.0) { + BC.errs() << "BOLT-WARNING: the output profile is empty or the " + "--profile-density-cutoff-hot option is " + "set too low. Please check your command.\n"; + } else if (Density < opts::ProfileDensityThreshold) { + BC.errs() + << "BOLT-WARNING: BOLT is estimated to optimize better with " + << format("%.1f", opts::ProfileDensityThreshold / Density) + << "x more samples. 
Please consider increasing sampling rate or " + "profiling for longer duration to get more samples.\n"; + } + + BC.outs() << "BOLT-INFO: Functions with density >= " + << format("%.1f", Density) << " account for " + << format("%.2f", + static_cast(opts::ProfileDensityCutOffHot) / + 10000) + << "% total sample counts.\n"; + } return Error::success(); } diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 2e83ca03dbde6..ffd693f9bbaed 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -849,6 +849,12 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, return false; } + // Set ParentFunc to BAT parent function or FromFunc itself. + BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc); + if (!ParentFunc) + ParentFunc = FromFunc; + ParentFunc->SampleCountInBytes += Count * (Second.From - First.To); + std::optional FTs = BAT ? BAT->getFallthroughsInTrace(FromFunc->getAddress(), First.To, Second.From) @@ -868,13 +874,12 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, << FromFunc->getPrintName() << ":" << Twine::utohexstr(First.To) << " to " << Twine::utohexstr(Second.From) << ".\n"); - BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc); for (auto [From, To] : *FTs) { if (BAT) { From = BAT->translate(FromFunc->getAddress(), From, /*IsBranchSrc=*/true); To = BAT->translate(FromFunc->getAddress(), To, /*IsBranchSrc=*/false); } - doIntraBranch(ParentFunc ? 
*ParentFunc : *FromFunc, From, To, Count, false); + doIntraBranch(*ParentFunc, From, To, Count, false); } return true; diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp index 435a8fa9cafca..de82420a16713 100644 --- a/bolt/lib/Utils/CommandLineOpts.cpp +++ b/bolt/lib/Utils/CommandLineOpts.cpp @@ -175,6 +175,10 @@ cl::opt SaveProfile("w", cl::desc("save recorded profile to a file"), cl::cat(BoltOutputCategory)); +cl::opt ShowDensity("show-density", + cl::desc("show profile density details"), + cl::Optional, cl::cat(AggregatorCategory)); + cl::opt SplitEH("split-eh", cl::desc("split C++ exception handling code"), cl::Hidden, cl::cat(BoltOptCategory)); diff --git a/bolt/test/X86/pre-aggregated-perf.test b/bolt/test/X86/pre-aggregated-perf.test index 90252f9ff68da..3242ba22f5916 100644 --- a/bolt/test/X86/pre-aggregated-perf.test +++ b/bolt/test/X86/pre-aggregated-perf.test @@ -11,7 +11,19 @@ REQUIRES: system-linux RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated.txt -w %t.new \ -RUN: --profile-use-dfs | FileCheck %s +RUN: --profile-density-threshold=9 --profile-density-cutoff-hot=970000 \ +RUN: --profile-use-dfs | FileCheck %s --check-prefix=CHECK-P2B + +CHECK-P2B: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile +CHECK-P2B: BOLT-INFO: Functions with density >= 21.7 account for 97.00% total sample counts. + +RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated.txt -w %t.new \ +RUN: --profile-density-cutoff-hot=970000 \ +RUN: --profile-use-dfs 2>&1 | FileCheck %s --check-prefix=CHECK-WARNING + +CHECK-WARNING: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile +CHECK-WARNING: BOLT-WARNING: BOLT is estimated to optimize better with 2.8x more samples. +CHECK-WARNING: BOLT-INFO: Functions with density >= 21.7 account for 97.00% total sample counts. 
RUN: llvm-bolt %t.exe -data %t -o %t.null | FileCheck %s RUN: llvm-bolt %t.exe -data %t.new -o %t.null | FileCheck %s diff --git a/bolt/tools/driver/llvm-bolt.cpp b/bolt/tools/driver/llvm-bolt.cpp index a8d1ac6480893..efa06cd68cb99 100644 --- a/bolt/tools/driver/llvm-bolt.cpp +++ b/bolt/tools/driver/llvm-bolt.cpp @@ -129,6 +129,7 @@ void perf2boltMode(int argc, char **argv) { exit(1); } opts::AggregateOnly = true; + opts::ShowDensity = true; } void boltDiffMode(int argc, char **argv) {