diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h index bc95e2c4de3a1..32e48b964d739 100644 --- a/bolt/include/bolt/Core/BinaryBasicBlock.h +++ b/bolt/include/bolt/Core/BinaryBasicBlock.h @@ -677,9 +677,7 @@ class BinaryBasicBlock { return isSplit(); } - void setIsCold(const bool Flag) { - Fragment = Flag ? FragmentNum::cold() : FragmentNum::main(); - } + void setIsCold(const bool Flag); /// Return true if the block can be outlined. At the moment we disallow /// outlining of blocks that can potentially throw exceptions or are diff --git a/bolt/include/bolt/Core/BinaryContext.h b/bolt/include/bolt/Core/BinaryContext.h index ad1bf2baaeb5b..80cbcc37d2a21 100644 --- a/bolt/include/bolt/Core/BinaryContext.h +++ b/bolt/include/bolt/Core/BinaryContext.h @@ -927,6 +927,8 @@ class BinaryContext { const char *getMainCodeSectionName() const { return ".text"; } + const char *getWarmCodeSectionName() const { return ".text.warm"; } + const char *getColdCodeSectionName() const { return ".text.cold"; } const char *getHotTextMoverSectionName() const { return ".text.mover"; } @@ -1230,6 +1232,9 @@ class BinaryContext { /// /// Return the pair where the first size is for the main part, and the second /// size is for the cold one. + /// Modify BinaryBasicBlock::OutputAddressRange for each basic block in the + /// function in place so that BB.OutputAddressRange.second less + /// BB.OutputAddressRange.first gives the emitted size of BB. std::pair calculateEmittedSize(BinaryFunction &BF, bool FixBranches = true); diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 72c360ca0c2db..31677fabae1d9 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -355,6 +355,9 @@ class BinaryFunction { /// Name for the section this function code should reside in. std::string CodeSectionName; + /// Name for the corresponding warm code section. 
+ std::string WarmCodeSectionName; + /// Name for the corresponding cold code section. std::string ColdCodeSectionName; @@ -1231,13 +1234,7 @@ class BinaryFunction { /// Return internal section name for this function. SmallString<32> - getCodeSectionName(const FragmentNum Fragment = FragmentNum::main()) const { - if (Fragment == FragmentNum::main()) - return SmallString<32>(CodeSectionName); - if (Fragment == FragmentNum::cold()) - return SmallString<32>(ColdCodeSectionName); - return formatv("{0}.{1}", ColdCodeSectionName, Fragment.get() - 1); - } + getCodeSectionName(const FragmentNum Fragment = FragmentNum::main()) const; /// Assign a code section name to the function. void setCodeSectionName(const StringRef Name) { @@ -1250,6 +1247,11 @@ class BinaryFunction { return BC.getUniqueSectionByName(getCodeSectionName(Fragment)); } + /// Assign a section name for the warm part of the function. + void setWarmCodeSectionName(const StringRef Name) { + WarmCodeSectionName = Name.str(); + } + /// Assign a section name for the cold part of the function. void setColdCodeSectionName(const StringRef Name) { ColdCodeSectionName = Name.str(); @@ -1272,6 +1274,20 @@ class BinaryFunction { /// otherwise processed. bool isPseudo() const { return IsPseudo; } + /// Return true if every block in the function has a valid execution count. + bool hasFullProfile() const { + return llvm::all_of(blocks(), [](const BinaryBasicBlock &BB) { + return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE; + }); + } + + /// Return true if every block in the function has a zero execution count. + bool allBlocksCold() const { + return llvm::all_of(blocks(), [](const BinaryBasicBlock &BB) { + return BB.getExecutionCount() == 0; + }); + } + /// Return true if the function contains explicit or implicit indirect branch /// to its split fragments, e.g., split jump table, landing pad in split /// fragment. 
diff --git a/bolt/include/bolt/Core/FunctionLayout.h b/bolt/include/bolt/Core/FunctionLayout.h index 904da3a4a93aa..94d71a84aae83 100644 --- a/bolt/include/bolt/Core/FunctionLayout.h +++ b/bolt/include/bolt/Core/FunctionLayout.h @@ -62,7 +62,10 @@ class FragmentNum { } static constexpr FragmentNum main() { return FragmentNum(0); } - static constexpr FragmentNum cold() { return FragmentNum(1); } + static constexpr FragmentNum warm() { return FragmentNum(1); } + static constexpr FragmentNum cold(bool Flag = false) { + return FragmentNum(Flag ? 2 : 1); + } }; /// A freestanding subset of contiguous blocks of a function. diff --git a/bolt/include/bolt/Passes/CDSplit.h b/bolt/include/bolt/Passes/CDSplit.h new file mode 100644 index 0000000000000..96a982683a7ec --- /dev/null +++ b/bolt/include/bolt/Passes/CDSplit.h @@ -0,0 +1,63 @@ +//===- bolt/Passes/CDSplit.h - Split functions into hot/warm/cold +// after function reordering pass -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef BOLT_PASSES_CDSPLIT +#define BOLT_PASSES_CDSPLIT + +#include "bolt/Passes/SplitFunctions.h" +#include + +namespace llvm { +namespace bolt { + +using BasicBlockOrder = BinaryFunction::BasicBlockOrderType; + +class CDSplit : public BinaryFunctionPass { +private: + /// Overall stats. + std::atomic SplitBytesHot{0ull}; + std::atomic SplitBytesCold{0ull}; + + /// List of functions to be considered. + /// All functions in the list are used to construct a call graph. + /// A subset of functions in this list are considered for splitting. + std::vector FunctionsToConsider; + + /// Helper functions to initialize global variables. + void initialize(BinaryContext &BC); + + /// Split function body into 3 fragments: hot / warm / cold. 
+ void runOnFunction(BinaryFunction &BF); + + /// Assign each basic block in the given function to either hot, cold, + /// or warm fragment using the CDSplit algorithm. + void assignFragmentThreeWay(const BinaryFunction &BF, + const BasicBlockOrder &BlockOrder); + + /// Find the best split index that separates hot from warm. + /// The basic block whose index equals the returned split index will be the + /// last hot block. + size_t findSplitIndex(const BinaryFunction &BF, + const BasicBlockOrder &BlockOrder); + +public: + explicit CDSplit(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) {} + + bool shouldOptimize(const BinaryFunction &BF) const override; + + const char *getName() const override { return "cdsplit"; } + + void runOnFunctions(BinaryContext &BC) override; +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/include/bolt/Passes/SplitFunctions.h b/bolt/include/bolt/Passes/SplitFunctions.h index 4058f3317dfbd..91b6d5518eaab 100644 --- a/bolt/include/bolt/Passes/SplitFunctions.h +++ b/bolt/include/bolt/Passes/SplitFunctions.h @@ -50,6 +50,19 @@ class SplitFunctions : public BinaryFunctionPass { /// Split function body into fragments. void splitFunction(BinaryFunction &Function, SplitStrategy &Strategy); + std::atomic SplitBytesHot{0ull}; + std::atomic SplitBytesCold{0ull}; + +public: + explicit SplitFunctions(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) {} + + bool shouldOptimize(const BinaryFunction &BF) const override; + + const char *getName() const override { return "split-functions"; } + + void runOnFunctions(BinaryContext &BC) override; + struct TrampolineKey { FragmentNum SourceFN = FragmentNum::main(); const MCSymbol *Target = nullptr; @@ -81,27 +94,14 @@ class SplitFunctions : public BinaryFunctionPass { /// corresponding thrower block. The trampoline landing pad, when created, /// will redirect the execution to the real landing pad in a different /// fragment. 
- TrampolineSetType createEHTrampolines(BinaryFunction &Function) const; + static TrampolineSetType createEHTrampolines(BinaryFunction &Function); /// Merge trampolines into \p Layout without trampolines. The merge will place /// a trampoline immediately before its destination. Used to revert the effect /// of trampolines after createEHTrampolines(). - BasicBlockOrderType + static BasicBlockOrderType mergeEHTrampolines(BinaryFunction &BF, BasicBlockOrderType &Layout, - const TrampolineSetType &Trampolines) const; - - std::atomic SplitBytesHot{0ull}; - std::atomic SplitBytesCold{0ull}; - -public: - explicit SplitFunctions(const cl::opt &PrintPass) - : BinaryFunctionPass(PrintPass) {} - - bool shouldOptimize(const BinaryFunction &BF) const override; - - const char *getName() const override { return "split-functions"; } - - void runOnFunctions(BinaryContext &BC) override; + const TrampolineSetType &Trampolines); }; } // namespace bolt diff --git a/bolt/lib/Core/BinaryBasicBlock.cpp b/bolt/lib/Core/BinaryBasicBlock.cpp index 984bc6dbd220a..40a655c0cd24b 100644 --- a/bolt/lib/Core/BinaryBasicBlock.cpp +++ b/bolt/lib/Core/BinaryBasicBlock.cpp @@ -15,17 +15,25 @@ #include "bolt/Core/BinaryFunction.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/MC/MCInst.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Errc.h" #define DEBUG_TYPE "bolt" -namespace llvm { -namespace bolt { +using namespace llvm; +using namespace bolt; +namespace opts { +extern cl::opt UseCDSplit; +} constexpr uint32_t BinaryBasicBlock::INVALID_OFFSET; -bool operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) { - return LHS.Index < RHS.Index; +bool bolt::operator<(const BinaryBasicBlock &LHS, const BinaryBasicBlock &RHS) { + return LHS.getIndex() < RHS.getIndex(); +} + +void BinaryBasicBlock::setIsCold(const bool Flag) { + Fragment = Flag ? 
FragmentNum::cold(opts::UseCDSplit) : FragmentNum::main(); } bool BinaryBasicBlock::hasCFG() const { return getParent()->hasCFG(); } @@ -611,6 +619,3 @@ BinaryBasicBlock *BinaryBasicBlock::splitAt(iterator II) { return NewBlock; } - -} // namespace bolt -} // namespace llvm diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 06b68765909d2..d04f00efd27ce 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -2331,14 +2331,37 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) { MCAsmLayout Layout(Assembler); Assembler.layout(Layout); + // Obtain fragment sizes. + std::vector FragmentSizes; + // Main fragment size. const uint64_t HotSize = Layout.getSymbolOffset(*EndLabel) - Layout.getSymbolOffset(*StartLabel); - const uint64_t ColdSize = - std::accumulate(SplitLabels.begin(), SplitLabels.end(), 0ULL, - [&](const uint64_t Accu, const LabelRange &Labels) { - return Accu + Layout.getSymbolOffset(*Labels.second) - - Layout.getSymbolOffset(*Labels.first); - }); + FragmentSizes.push_back(HotSize); + // Split fragment sizes. + uint64_t ColdSize = 0; + for (const auto &Labels : SplitLabels) { + uint64_t Size = Layout.getSymbolOffset(*Labels.second) - + Layout.getSymbolOffset(*Labels.first); + FragmentSizes.push_back(Size); + ColdSize += Size; + } + + // Populate new start and end offsets of each basic block. + BinaryBasicBlock *PrevBB = nullptr; + uint64_t FragmentIndex = 0; + for (FunctionFragment &FF : BF.getLayout().fragments()) { + for (BinaryBasicBlock *BB : FF) { + const uint64_t BBStartOffset = Layout.getSymbolOffset(*(BB->getLabel())); + BB->setOutputStartAddress(BBStartOffset); + if (PrevBB) + PrevBB->setOutputEndAddress(BBStartOffset); + PrevBB = BB; + } + if (PrevBB) + PrevBB->setOutputEndAddress(FragmentSizes[FragmentIndex]); + FragmentIndex++; + PrevBB = nullptr; + } // Clean-up the effect of the code emission. 
for (const MCSymbol &Symbol : Assembler.symbols()) { diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index fb1bf530c1974..9829c6bc107f0 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -34,6 +34,7 @@ namespace opts { extern cl::opt JumpTables; extern cl::opt PreserveBlocksAlignment; +extern cl::opt UseCDSplit; cl::opt AlignBlocks("align-blocks", cl::desc("align basic blocks"), cl::cat(BoltOptCategory)); @@ -287,7 +288,10 @@ void BinaryEmitter::emitFunctions() { // Mark the end of hot text. if (opts::HotText) { - Streamer.switchSection(BC.getTextSection()); + if (opts::UseCDSplit) + Streamer.switchSection(BC.getCodeSection(BC.getWarmCodeSectionName())); + else + Streamer.switchSection(BC.getTextSection()); Streamer.emitLabel(BC.getHotTextEndSymbol()); } } diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index e81d58ef0b104..0b89be26def39 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -59,6 +59,7 @@ extern cl::opt EnableBAT; extern cl::opt Instrument; extern cl::opt StrictMode; extern cl::opt UpdateDebugSections; +extern cl::opt UseCDSplit; extern cl::opt Verbosity; extern bool processAllFunctions(); @@ -167,6 +168,18 @@ template static bool emptyRange(const R &Range) { return Range.begin() == Range.end(); } +/// Return internal section name for this function. +SmallString<32> +BinaryFunction::getCodeSectionName(const FragmentNum Fragment) const { + if (Fragment == FragmentNum::main()) + return SmallString<32>(CodeSectionName); + if (Fragment == FragmentNum::cold(opts::UseCDSplit)) + return SmallString<32>(ColdCodeSectionName); + if (Fragment == FragmentNum::warm()) + return SmallString<32>(WarmCodeSectionName); + return formatv("{0}.{1}", ColdCodeSectionName, Fragment.get() - 1); +} + /// Gets debug line information for the instruction located at the given /// address in the original binary. 
The SMLoc's pointer is used /// to point to this information, which is represented by a diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index 4e1343e2c30be..c60a03d3d9847 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1244,8 +1244,10 @@ void AssignSections::runOnFunctions(BinaryContext &BC) { else Function.setCodeSectionName(BC.getColdCodeSectionName()); - if (Function.isSplit()) + if (Function.isSplit()) { + Function.setWarmCodeSectionName(BC.getWarmCodeSectionName()); Function.setColdCodeSectionName(BC.getColdCodeSectionName()); + } } } diff --git a/bolt/lib/Passes/CDSplit.cpp b/bolt/lib/Passes/CDSplit.cpp new file mode 100644 index 0000000000000..509d250c34191 --- /dev/null +++ b/bolt/lib/Passes/CDSplit.cpp @@ -0,0 +1,207 @@ +//===- bolt/Passes/CDSplit.cpp - Pass for splitting function code 3-way +//--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the CDSplit pass. +// +//===----------------------------------------------------------------------===// + +#include "bolt/Passes/CDSplit.h" +#include "bolt/Core/ParallelUtilities.h" +#include "bolt/Utils/CommandLineOpts.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Support/MathExtras.h" + +#define DEBUG_TYPE "bolt-opts" + +using namespace llvm; +using namespace bolt; + +namespace opts { +extern cl::OptionCategory BoltOptCategory; + +extern cl::opt UseCDSplit; +extern cl::opt SplitEH; +extern cl::opt ExecutionCountThreshold; +} // namespace opts + +namespace llvm { +namespace bolt { + +namespace { +/// Return true if the function should be considered for building call graph. 
+bool shouldConsider(const BinaryFunction &BF) { + return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty(); +} +} // anonymous namespace + +bool CDSplit::shouldOptimize(const BinaryFunction &BF) const { + // Do not split functions with a small execution count. + if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold) + return false; + + // Do not split functions with at least one block that has no known + // execution count due to incomplete information. + // Do not split functions with only zero-execution count blocks + // as there is not enough variation in block count to justify splitting. + if (!BF.hasFullProfile() || BF.allBlocksCold()) + return false; + + return BinaryFunctionPass::shouldOptimize(BF); +} + +/// Initialize algorithm's metadata. +void CDSplit::initialize(BinaryContext &BC) { + // Construct a list of functions that are considered for building call graph. + // Only those in this list that evaluate true for shouldOptimize are + // candidates for 3-way splitting. + std::vector SortedFunctions = BC.getSortedFunctions(); + FunctionsToConsider.reserve(SortedFunctions.size()); + for (BinaryFunction *BF : SortedFunctions) { + if (shouldConsider(*BF)) + FunctionsToConsider.push_back(BF); + } +} + +/// Find the best index for splitting. The returned value is the index of the +/// last hot basic block. Hence, "no splitting" is equivalent to returning the +/// value which is one less than the size of the function. +size_t CDSplit::findSplitIndex(const BinaryFunction &BF, + const BasicBlockOrder &BlockOrder) { + // Placeholder: hot-cold splitting. + return BF.getLayout().getMainFragment().size() - 1; +} + +/// Assign each basic block in the given function to either hot, cold, +/// or warm fragment using the CDSplit algorithm. 
+void CDSplit::assignFragmentThreeWay(const BinaryFunction &BF, + const BasicBlockOrder &BlockOrder) { + size_t BestSplitIndex = findSplitIndex(BF, BlockOrder); + + // Assign fragments based on the computed best split index. + // All basic blocks with index up to the best split index become hot. + // All remaining blocks are warm / cold depending on whether their count is + // greater than 0. + FragmentNum Main(0); + FragmentNum Warm(1); + FragmentNum Cold(2); + for (size_t Index = 0; Index < BlockOrder.size(); Index++) { + BinaryBasicBlock *BB = BlockOrder[Index]; + if (Index <= BestSplitIndex) + BB->setFragmentNum(Main); + else + BB->setFragmentNum(BB->getKnownExecutionCount() > 0 ? Warm : Cold); + } +} + +void CDSplit::runOnFunction(BinaryFunction &BF) { + assert(!BF.empty() && "splitting an empty function"); + + FunctionLayout &Layout = BF.getLayout(); + BinaryContext &BC = BF.getBinaryContext(); + + BasicBlockOrder NewLayout(Layout.block_begin(), Layout.block_end()); + // Never outline the first basic block. + NewLayout.front()->setCanOutline(false); + for (BinaryBasicBlock *BB : NewLayout) { + if (!BB->canOutline()) + continue; + + // Do not split extra entry points in aarch64. They can be referred by + // using ADRs and when this happens, these blocks cannot be placed far + // away due to the limited range in ADR instruction. + if (BC.isAArch64() && BB->isEntryPoint()) { + BB->setCanOutline(false); + continue; + } + + if (BF.hasEHRanges() && !opts::SplitEH) { + // We cannot move landing pads (or rather entry points for landing pads). + if (BB->isLandingPad()) { + BB->setCanOutline(false); + continue; + } + // We cannot move a block that can throw since exception-handling + // runtime cannot deal with split functions. However, if we can guarantee + // that the block never throws, it is safe to move the block to + // decrease the size of the function. 
+ for (MCInst &Instr : *BB) { + if (BC.MIB->isInvoke(Instr)) { + BB->setCanOutline(false); + break; + } + } + } + } + + // Assign each basic block in NewLayout to either hot, warm, or cold fragment. + assignFragmentThreeWay(BF, NewLayout); + + // Make sure all non-outlineable blocks are in the main-fragment. + for (BinaryBasicBlock *BB : NewLayout) { + if (!BB->canOutline()) + BB->setFragmentNum(FragmentNum::main()); + } + + // In case any non-outlineable blocks previously in warm or cold are now set + // to be in main by the preceding for loop, move them to the end of main. + llvm::stable_sort(NewLayout, + [&](const BinaryBasicBlock *L, const BinaryBasicBlock *R) { + return L->getFragmentNum() < R->getFragmentNum(); + }); + + BF.getLayout().update(NewLayout); + + // For shared objects, invoke instructions and corresponding landing pads + // have to be placed in the same fragment. When we split them, create + // trampoline landing pads that will redirect the execution to real LPs. + SplitFunctions::TrampolineSetType Trampolines; + if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit()) + Trampolines = SplitFunctions::createEHTrampolines(BF); + + if (BC.isX86() && BF.isSplit()) { + size_t HotSize; + size_t ColdSize; + std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF); + SplitBytesHot += HotSize; + SplitBytesCold += ColdSize; + } +} + +void CDSplit::runOnFunctions(BinaryContext &BC) { + if (!opts::UseCDSplit) + return; + + // Initialize global variables. + initialize(BC); + + // Only functions satisfying shouldConsider and shouldOptimize are candidates + // for splitting. + ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) { + return !(shouldConsider(BF) && shouldOptimize(BF)); + }; + + // Make function splitting decisions in parallel. 
+ ParallelUtilities::runOnEachFunction( + BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, + [&](BinaryFunction &BF) { runOnFunction(BF); }, SkipFunc, "CDSplit", + /*ForceSequential=*/false); + + if (SplitBytesHot + SplitBytesCold > 0) { + outs() << "BOLT-INFO: cdsplit separates " << SplitBytesHot + << " hot bytes from " << SplitBytesCold << " cold bytes " + << format("(%.2lf%% of split functions is in the main fragment)\n", + 100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold)); + + } else + outs() << "BOLT-INFO: cdsplit didn't split any functions\n"; +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt index b8bbe59a64480..4cc4b4fa6ae34 100644 --- a/bolt/lib/Passes/CMakeLists.txt +++ b/bolt/lib/Passes/CMakeLists.txt @@ -9,6 +9,7 @@ add_llvm_library(LLVMBOLTPasses CacheMetrics.cpp CallGraph.cpp CallGraphWalker.cpp + CDSplit.cpp DataflowAnalysis.cpp DataflowInfoManager.cpp FrameAnalysis.cpp diff --git a/bolt/lib/Passes/IndirectCallPromotion.cpp b/bolt/lib/Passes/IndirectCallPromotion.cpp index 8b084c3b63d56..a46df74d7fd7b 100644 --- a/bolt/lib/Passes/IndirectCallPromotion.cpp +++ b/bolt/lib/Passes/IndirectCallPromotion.cpp @@ -34,6 +34,7 @@ extern cl::OptionCategory BoltOptCategory; extern cl::opt ICP; extern cl::opt Verbosity; extern cl::opt ExecutionCountThreshold; +extern cl::opt UseCDSplit; static cl::opt ICPJTRemainingPercentThreshold( "icp-jt-remaining-percent-threshold", @@ -259,9 +260,10 @@ IndirectCallPromotion::getCallTargets(BinaryBasicBlock &BB, MCSymbol *Entry = JT->Entries[I]; const BinaryBasicBlock *ToBB = BF.getBasicBlockForLabel(Entry); assert(ToBB || Entry == BF.getFunctionEndLabel() || - Entry == BF.getFunctionEndLabel(FragmentNum::cold())); + Entry == + BF.getFunctionEndLabel(FragmentNum::cold(opts::UseCDSplit))); if (Entry == BF.getFunctionEndLabel() || - Entry == BF.getFunctionEndLabel(FragmentNum::cold())) + Entry == 
BF.getFunctionEndLabel(FragmentNum::cold(opts::UseCDSplit))) continue; const Location To(Entry); const BinaryBasicBlock::BinaryBranchInfo &BI = BB.getBranchInfo(*ToBB); diff --git a/bolt/lib/Passes/SplitFunctions.cpp b/bolt/lib/Passes/SplitFunctions.cpp index 34973cecdf491..a5eb19724c314 100644 --- a/bolt/lib/Passes/SplitFunctions.cpp +++ b/bolt/lib/Passes/SplitFunctions.cpp @@ -60,6 +60,7 @@ extern cl::OptionCategory BoltOptCategory; extern cl::opt SplitEH; extern cl::opt ExecutionCountThreshold; extern cl::opt RandomSeed; +extern cl::opt UseCDSplit; static cl::opt AggressiveSplitting( "split-all-cold", cl::desc("outline as many cold basic blocks as possible"), @@ -109,29 +110,17 @@ static cl::opt SplitStrategy( } // namespace opts namespace { -bool hasFullProfile(const BinaryFunction &BF) { - return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) { - return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE; - }); -} - -bool allBlocksCold(const BinaryFunction &BF) { - return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) { - return BB.getExecutionCount() == 0; - }); -} - struct SplitProfile2 final : public SplitStrategy { bool canSplit(const BinaryFunction &BF) override { - return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF); + return BF.hasValidProfile() && BF.hasFullProfile() && !BF.allBlocksCold(); } - bool keepEmpty() override { return false; } + bool keepEmpty() override { return opts::UseCDSplit ? 
true : false; } void fragment(const BlockIt Start, const BlockIt End) override { for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) { if (BB->getExecutionCount() == 0) - BB->setFragmentNum(FragmentNum::cold()); + BB->setFragmentNum(FragmentNum::cold(opts::UseCDSplit)); } } }; @@ -155,7 +144,7 @@ struct SplitRandom2 final : public SplitStrategy { std::uniform_int_distribution Dist(1, LastSplitPoint); const DiffT SplitPoint = Dist(Gen); for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End)) - BB->setFragmentNum(FragmentNum::cold()); + BB->setFragmentNum(FragmentNum::cold(opts::UseCDSplit)); LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of " "{1} possible) blocks to split\n", @@ -243,6 +232,17 @@ bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const { } void SplitFunctions::runOnFunctions(BinaryContext &BC) { + if (opts::UseCDSplit && + !(opts::SplitFunctions && + opts::SplitStrategy == SplitFunctionsStrategy::Profile2)) { + errs() << "BOLT-ERROR: -use-cdsplit should be applied together with " + "-split-functions using default -split-strategy=profile2. " + "-split-functions 2-way splits functions before the function " + "reordering pass, while -use-cdsplit 3-way splits functions " + "after the function reordering pass. \n"; + exit(1); + } + if (!opts::SplitFunctions) return; @@ -434,7 +434,7 @@ void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) { } SplitFunctions::TrampolineSetType -SplitFunctions::createEHTrampolines(BinaryFunction &BF) const { +SplitFunctions::createEHTrampolines(BinaryFunction &BF) { const auto &MIB = BF.getBinaryContext().MIB; // Map real landing pads to the corresponding trampolines. 
@@ -501,7 +501,7 @@ SplitFunctions::createEHTrampolines(BinaryFunction &BF) const { SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines( BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout, - const SplitFunctions::TrampolineSetType &Trampolines) const { + const SplitFunctions::TrampolineSetType &Trampolines) { DenseMap> IncomingTrampolines; for (const auto &Entry : Trampolines) { diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp index 37de3eabc6d23..28983de11c3ae 100644 --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -11,6 +11,7 @@ #include "bolt/Passes/Aligner.h" #include "bolt/Passes/AllocCombiner.h" #include "bolt/Passes/AsmDump.h" +#include "bolt/Passes/CDSplit.h" #include "bolt/Passes/CMOVConversion.h" #include "bolt/Passes/FixRISCVCallsPass.h" #include "bolt/Passes/FixRelaxationPass.h" @@ -182,6 +183,10 @@ static cl::opt PrintSplit("print-split", cl::desc("print functions after code splitting"), cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt PrintCDSplit("print-cdsplit", + cl::desc("print functions after cdsplit"), + cl::Hidden, cl::cat(BoltOptCategory)); + static cl::opt PrintStoke("print-stoke", cl::desc("print functions after stoke analysis"), cl::Hidden, cl::cat(BoltOptCategory)); @@ -430,6 +435,11 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { Manager.registerPass( std::make_unique(PrintReorderedFunctions)); + /// This pass three-way splits functions after function reordering. + Manager.registerPass(std::make_unique(PrintCDSplit)); + + Manager.registerPass(std::make_unique(PrintAfterBranchFixup)); + // Print final dyno stats right while CFG and instruction analysis are intact. 
Manager.registerPass( std::make_unique( diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 81c9cbff726bb..6d0b3cd5d1532 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -84,6 +84,7 @@ extern cl::opt JumpTables; extern cl::list ReorderData; extern cl::opt ReorderFunctions; extern cl::opt TimeBuild; +extern cl::opt UseCDSplit; cl::opt AllowStripped("allow-stripped", cl::desc("allow processing of stripped binaries"), @@ -3478,11 +3479,21 @@ std::vector RewriteInstance::getCodeSections() { if (B->getName() == BC->getHotTextMoverSectionName()) return false; - // Depending on the option, put main text at the beginning or at the end. - if (opts::HotFunctionsAtEnd) - return B->getName() == BC->getMainCodeSectionName(); - else - return A->getName() == BC->getMainCodeSectionName(); + // Depending on opts::HotFunctionsAtEnd, place main and warm sections in + // order. + if (opts::HotFunctionsAtEnd) { + if (B->getName() == BC->getMainCodeSectionName()) + return true; + if (A->getName() == BC->getMainCodeSectionName()) + return false; + return (B->getName() == BC->getWarmCodeSectionName()); + } else { + if (A->getName() == BC->getMainCodeSectionName()) + return true; + if (B->getName() == BC->getMainCodeSectionName()) + return false; + return (A->getName() == BC->getWarmCodeSectionName()); + } }; // Determine the order of sections. 
@@ -3639,7 +3650,7 @@ void RewriteInstance::mapCodeSections(BOLTLinker::SectionMapper MapSection) { "non-relocation mode."); FunctionFragment &FF = - Function.getLayout().getFragment(FragmentNum::cold()); + Function.getLayout().getFragment(FragmentNum::cold(opts::UseCDSplit)); ErrorOr ColdSection = Function.getCodeSection(FF.getFragmentNum()); assert(ColdSection && "cannot find section for cold part"); @@ -4423,9 +4434,15 @@ void RewriteInstance::updateELFSymbolTable( Function.getLayout().getSplitFragments()) { if (FF.getAddress()) { ELFSymTy NewColdSym = FunctionSymbol; - const SmallString<256> SymbolName = formatv( - "{0}.cold.{1}", cantFail(FunctionSymbol.getName(StringSection)), - FF.getFragmentNum().get() - 1); + SmallString<256> SymbolName; + if (opts::UseCDSplit) + SymbolName = formatv( + "{0}.{1}", cantFail(FunctionSymbol.getName(StringSection)), + FF.getFragmentNum().get() == 1 ? "warm" : "cold"); + else + SymbolName = formatv( + "{0}.cold.{1}", cantFail(FunctionSymbol.getName(StringSection)), + FF.getFragmentNum().get() - 1); NewColdSym.st_name = AddToStrTab(SymbolName); NewColdSym.st_shndx = Function.getCodeSection(FF.getFragmentNum())->getIndex(); @@ -4684,8 +4701,8 @@ void RewriteInstance::updateELFSymbolTable( SmallVector Buf; NewColdSym.st_name = AddToStrTab( Twine(Function->getPrintName()).concat(".cold.0").toStringRef(Buf)); - const FunctionFragment &ColdFF = - Function->getLayout().getFragment(FragmentNum::cold()); + const FunctionFragment &ColdFF = Function->getLayout().getFragment( + FragmentNum::cold(opts::UseCDSplit)); NewColdSym.st_value = ColdFF.getAddress(); NewColdSym.st_size = ColdFF.getImageSize(); Symbols.emplace_back(NewColdSym); diff --git a/bolt/lib/Utils/CommandLineOpts.cpp b/bolt/lib/Utils/CommandLineOpts.cpp index a1df5de262340..75d63e369c731 100644 --- a/bolt/lib/Utils/CommandLineOpts.cpp +++ b/bolt/lib/Utils/CommandLineOpts.cpp @@ -191,6 +191,12 @@ cl::opt cl::init(0), cl::ZeroOrMore, cl::cat(BoltCategory), 
cl::sub(cl::SubCommand::getAll())); +cl::opt + UseCDSplit("use-cdsplit", + cl::desc("split functions into 3 fragments using the CDSplit " + "algorithm after function reordering pass"), + cl::init(false), cl::cat(BoltOptCategory)); + bool processAllFunctions() { if (opts::AggregateOnly) return false;