From cc2fbc648d7babbfa612f4f5eda3160212ef6ca7 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Wed, 25 Oct 2023 07:52:26 -0700 Subject: [PATCH] [CodeLayout] Faster basic block reordering, ext-tsp (#68617) Aggressive inlining might produce huge functions with >10K basic blocks. Since BFI treats _all_ blocks and jumps as "hot" having non-negative (but perhaps small) weight, the current implementation can be slow, taking minutes to produce a layout. This change introduces a few modifications that significantly (up to 50x on some instances) speed up the computation. Some notable changes: - reduced the maximum chain size to 512 (from the prior 4096); - introduced MaxMergeDensityRatio param to avoid merging chains with very different densities; - dropped a couple of params that seem unnecessary. Looking at some "offline" metrics (e.g., the number of created fall-throughs), there shouldn't be problems; in fact, I do see some metrics go up. But it might be hard/impossible to measure a perf difference for such small changes. I did test the performance of a clang-14 binary and did not record any perf or i-cache-related differences. My 6 benchmarks, with ext-tsp runtime (the lower the better) and "tsp-score" (the higher the better). 
**Before**: - benchmark 1: num functions: 13,047 reordering running time is 2.4 seconds score: 125503458 (128.3102%) - benchmark 2: num functions: 16,438 reordering running time is 3.4 seconds score: 12613997277 (129.7495%) - benchmark 3: num functions: 12,359 reordering running time is 1.9 seconds score: 1315881613 (105.8991%) - benchmark 4: num functions: 96,588 reordering running time is 7.3 seconds score: 89513906284 (100.3413%) - benchmark 5: num functions: 1 reordering running time is 372 seconds score: 21292505965077 (99.9979%) - benchmark 6: num functions: 71,155 reordering running time is 314 seconds score: 29795381626270671437824 (102.7519%) **After**: - benchmark 1: reordering running time is 2.2 seconds score: 125510418 (128.3130%) - benchmark 2: reordering running time is 2.6 seconds score: 12614502162 (129.7525%) - benchmark 3: reordering running time is 1.6 seconds score: 1315938168 (105.9024%) - benchmark 4: reordering running time is 4.9 seconds score: 89518095837 (100.3454%) - benchmark 5: reordering running time is 4.8 seconds score: 21292295939119 (99.9971%) - benchmark 6: reordering running time is 104 seconds score: 29796710925310302879744 (102.7565%) --- llvm/lib/Transforms/Utils/CodeLayout.cpp | 119 ++++++++++-------- .../CodeGen/X86/code_placement_ext_tsp.ll | 13 +- .../X86/code_placement_ext_tsp_large.ll | 2 +- 3 files changed, 69 insertions(+), 65 deletions(-) diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp index 4801a55e3f11f..9a5909c97b0fd 100644 --- a/llvm/lib/Transforms/Utils/CodeLayout.cpp +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -101,8 +101,8 @@ static cl::opt BackwardDistance( // The maximum size of a chain created by the algorithm. The size is bounded // so that the algorithm can efficiently process extremely large instances. 
static cl::opt - MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(4096), - cl::desc("The maximum size of a chain to create.")); + MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(512), + cl::desc("The maximum size of a chain to create")); // The maximum size of a chain for splitting. Larger values of the threshold // may yield better quality at the cost of worsen run-time. @@ -110,11 +110,10 @@ static cl::opt ChainSplitThreshold( "ext-tsp-chain-split-threshold", cl::ReallyHidden, cl::init(128), cl::desc("The maximum size of a chain to apply splitting")); -// The option enables splitting (large) chains along in-coming and out-going -// jumps. This typically results in a better quality. -static cl::opt EnableChainSplitAlongJumps( - "ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true), - cl::desc("The maximum size of a chain to apply splitting")); +// The maximum ratio between densities of two chains for merging. +static cl::opt MaxMergeDensityRatio( + "ext-tsp-max-merge-density-ratio", cl::ReallyHidden, cl::init(100), + cl::desc("The maximum ratio between densities of two chains for merging")); // Algorithm-specific options for CDS. static cl::opt CacheEntries("cds-cache-entries", cl::ReallyHidden, @@ -226,6 +225,9 @@ struct NodeT { bool isEntry() const { return Index == 0; } + // Check if Other is a successor of the node. + bool isSuccessor(const NodeT *Other) const; + // The total execution count of outgoing jumps. uint64_t outCount() const; @@ -289,7 +291,7 @@ struct ChainT { size_t numBlocks() const { return Nodes.size(); } - double density() const { return static_cast(ExecutionCount) / Size; } + double density() const { return ExecutionCount / Size; } bool isEntry() const { return Nodes[0]->Index == 0; } @@ -350,8 +352,9 @@ struct ChainT { uint64_t Id; // Cached ext-tsp score for the chain. double Score{0}; - // The total execution count of the chain. 
- uint64_t ExecutionCount{0}; + // The total execution count of the chain. Since the execution count of + // a basic block is uint64_t, using doubles here to avoid overflow. + double ExecutionCount{0}; // The total size of the chain. uint64_t Size{0}; // Nodes of the chain. @@ -446,6 +449,13 @@ struct ChainEdge { bool CacheValidBackward{false}; }; +bool NodeT::isSuccessor(const NodeT *Other) const { + for (JumpT *Jump : OutJumps) + if (Jump->Target == Other) + return true; + return false; +} + uint64_t NodeT::outCount() const { uint64_t Count = 0; for (JumpT *Jump : OutJumps) @@ -514,8 +524,6 @@ struct MergedNodesT { const NodeT *getFirstNode() const { return *Begin1; } - bool empty() const { return Begin1 == End1; } - private: NodeIter Begin1; NodeIter End1; @@ -639,7 +647,8 @@ class ExtTSPImpl { } } for (JumpT &Jump : AllJumps) { - assert(OutDegree[Jump.Source->Index] > 0); + assert(OutDegree[Jump.Source->Index] > 0 && + "incorrectly computed out-degree of the block"); Jump.IsConditional = OutDegree[Jump.Source->Index] > 1; } @@ -741,12 +750,23 @@ class ExtTSPImpl { // Get candidates for merging with the current chain. for (const auto &[ChainSucc, Edge] : ChainPred->Edges) { // Ignore loop edges. - if (ChainPred == ChainSucc) + if (Edge->isSelfEdge()) continue; - - // Stop early if the combined chain violates the maximum allowed size. + // Skip the merge if the combined chain violates the maximum specified + // size. if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize) continue; + // Don't merge the chains if they have vastly different densities. + // Skip the merge if the ratio between the densities exceeds + // MaxMergeDensityRatio. Smaller values of the option result in fewer + // merges, and hence, more chains. 
+ auto [minDensity, maxDensity] = + std::minmax(ChainPred->density(), ChainSucc->density()); + assert(minDensity > 0.0 && maxDensity > 0.0 && + "incorrectly computed chain densities"); + const double Ratio = maxDensity / minDensity; + if (Ratio > MaxMergeDensityRatio) + continue; // Compute the gain of merging the two chains. MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge); @@ -858,36 +878,42 @@ class ExtTSPImpl { Gain.updateIfLessThan( computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y)); - if (EnableChainSplitAlongJumps) { - // Attach (a part of) ChainPred before the first node of ChainSucc. - for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) { - const NodeT *SrcBlock = Jump->Source; - if (SrcBlock->CurChain != ChainPred) - continue; - size_t Offset = SrcBlock->CurIndex + 1; - tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y}); - } + // Attach (a part of) ChainPred before the first node of ChainSucc. + for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) { + const NodeT *SrcBlock = Jump->Source; + if (SrcBlock->CurChain != ChainPred) + continue; + size_t Offset = SrcBlock->CurIndex + 1; + tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y}); + } - // Attach (a part of) ChainPred after the last node of ChainSucc. - for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) { - const NodeT *DstBlock = Jump->Target; - if (DstBlock->CurChain != ChainPred) - continue; - size_t Offset = DstBlock->CurIndex; - tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1}); - } + // Attach (a part of) ChainPred after the last node of ChainSucc. + for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) { + const NodeT *DstBlock = Jump->Target; + if (DstBlock->CurChain != ChainPred) + continue; + size_t Offset = DstBlock->CurIndex; + tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1}); } // Try to break ChainPred in various ways and concatenate with ChainSucc. 
if (ChainPred->Nodes.size() <= ChainSplitThreshold) { for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) { - // Try to split the chain in different ways. In practice, applying - // X2_Y_X1 merging is almost never provides benefits; thus, we exclude - // it from consideration to reduce the search space. + // Do not split the chain along a fall-through jump. One of the two + // loops above may still "break" such a jump whenever it results in a + // new fall-through. + const NodeT *BB = ChainPred->Nodes[Offset - 1]; + const NodeT *BB2 = ChainPred->Nodes[Offset]; + if (BB->isSuccessor(BB2)) + continue; + + // In practice, applying X2_Y_X1 merging almost never provides benefits; + // thus, we exclude it from consideration to reduce the search space. tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1, MergeTypeT::X2_X1_Y}); } } + Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain); return Gain; } @@ -946,22 +972,11 @@ class ExtTSPImpl { /// Concatenate all chains into the final order. std::vector concatChains() { - // Collect chains and calculate density stats for their sorting. + // Collect non-empty chains. std::vector SortedChains; - DenseMap ChainDensity; for (ChainT &Chain : AllChains) { - if (!Chain.Nodes.empty()) { + if (!Chain.Nodes.empty()) SortedChains.push_back(&Chain); - // Using doubles to avoid overflow of ExecutionCounts. - double Size = 0; - double ExecutionCount = 0; - for (NodeT *Node : Chain.Nodes) { - Size += static_cast(Node->Size); - ExecutionCount += static_cast(Node->ExecutionCount); - } - assert(Size > 0 && "a chain of zero size"); - ChainDensity[&Chain] = ExecutionCount / Size; - } } // Sorting chains by density in the decreasing order. @@ -971,11 +986,9 @@ class ExtTSPImpl { if (L->isEntry() != R->isEntry()) return L->isEntry(); - const double DL = ChainDensity[L]; - const double DR = ChainDensity[R]; // Compare by density and break ties by chain identifiers. 
- return std::make_tuple(-DL, L->Id) < - std::make_tuple(-DR, R->Id); + return std::make_tuple(-L->density(), L->Id) < + std::make_tuple(-R->density(), R->Id); }); // Collect the nodes in the order specified by their chains. diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll index 4053b8a8e123b..be0b9820e1454 100644 --- a/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll +++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp.ll @@ -1,6 +1,5 @@ ;; See also llvm/unittests/Transforms/Utils/CodeLayoutTest.cpp ; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 < %s | FileCheck %s -; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -enable-ext-tsp-block-placement=1 -ext-tsp-chain-split-threshold=0 -ext-tsp-enable-chain-split-along-jumps=0 < %s | FileCheck %s -check-prefix=CHECK2 define void @func1a() { ; Test that the algorithm positions the most likely successor first @@ -329,8 +328,8 @@ end: } define void @func4() !prof !11 { -; Test verifying that, if enabled, chains can be split in order to improve the -; objective (by creating more fallthroughs) +; Test verifying that chains can be split in order to improve the objective +; by creating more fallthroughs ; ; +-------+ ; | entry |--------+ @@ -354,19 +353,11 @@ define void @func4() !prof !11 { ; | b2 | <+ ----+ ; +-------+ ; -; With chain splitting enabled: ; CHECK-LABEL: func4: ; CHECK: entry ; CHECK: b1 ; CHECK: b3 ; CHECK: b2 -; -; With chain splitting disabled: -; CHECK2-LABEL: func4: -; CHECK2: entry -; CHECK2: b1 -; CHECK2: b2 -; CHECK2: b3 entry: call void @b() diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll index bb081f6bab532..842aced4884f7 100644 --- a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll +++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll @@ -6,7 +6,7 @@ @yydebug = dso_local global i32 0, align 4 define void @func_large() !prof !0 { 
-; A largee CFG instance where chain splitting helps to +; A large CFG instance where chain splitting helps to ; compute a better basic block ordering. The test verifies that with chain ; splitting, the resulting layout is improved (e.g., the score is increased). ;