Skip to content

Commit

Permalink
[BOLT] Tail duplication analysis pass
Browse files Browse the repository at this point in the history
Summary:
Created a binary pass that records how many
times tail duplication would be used and how many cache
misses it would theoretically stop

(cherry picked from FBD29619858)
  • Loading branch information
jthamanfb authored and maksfb committed Jul 1, 2021
1 parent 60b1506 commit 2f46660
Show file tree
Hide file tree
Showing 5 changed files with 291 additions and 0 deletions.
9 changes: 9 additions & 0 deletions bolt/src/BinaryPassManager.cpp
Expand Up @@ -27,6 +27,7 @@
#include "Passes/RetpolineInsertion.h"
#include "Passes/SplitFunctions.h"
#include "Passes/StokeInfo.h"
#include "Passes/TailDuplication.h"
#include "Passes/ValidateInternalCalls.h"
#include "Passes/VeneerElimination.h"
#include "llvm/Support/FormatVariadic.h"
Expand Down Expand Up @@ -76,6 +77,11 @@ JTFootprintReductionFlag("jt-footprint-reduction",
cl::ZeroOrMore,
cl::cat(BoltOptCategory));

// Command-line switch "-tail-duplication". Its value is passed as the second
// argument when the TailDuplication pass is registered in runAllPasses.
static cl::opt<bool> TailDuplicationFlag(
"tail-duplication",
cl::desc("duplicate unconditional branches that cross a cache line"),
cl::ZeroOrMore, cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
PrintJTFootprintReduction("print-after-jt-footprint-reduction",
cl::desc("print function after jt-footprint-reduction pass"),
Expand Down Expand Up @@ -449,6 +455,9 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {

Manager.registerPass(std::make_unique<LoopInversionPass>());

Manager.registerPass(std::make_unique<TailDuplication>(),
opts::TailDuplicationFlag);

// This pass syncs local branches with CFG. If any of the following
// passes breaks the sync - they either need to re-run the pass or
// fix branches consistency internally.
Expand Down
1 change: 1 addition & 0 deletions bolt/src/Passes/CMakeLists.txt
Expand Up @@ -36,6 +36,7 @@ add_llvm_library(LLVMBOLTPasses
StackPointerTracking.cpp
StackReachingUses.cpp
StokeInfo.cpp
TailDuplication.cpp
ValidateInternalCalls.cpp
VeneerElimination.cpp
RetpolineInsertion.cpp
Expand Down
172 changes: 172 additions & 0 deletions bolt/src/Passes/TailDuplication.cpp
@@ -0,0 +1,172 @@
//===--------- Passes/TailDuplication.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "TailDuplication.h"

#include <numeric>

using namespace llvm;

// Command-line options that tune the tail duplication analysis.
namespace opts {

extern cl::OptionCategory BoltOptCategory;

// Selects which candidate-selection routine runOnFunction() uses:
// aggressiveCodeToDuplicate() when set, moderateCodeToDuplicate() otherwise.
// Defaults to false.
static cl::opt<bool> TailDuplicationAggressive(
"tail-duplication-aggressive",
cl::desc("tail duplication should act aggressively in duplicating multiple "
"blocks per tail"),
cl::ZeroOrMore, cl::ReallyHidden, cl::init(false),
cl::cat(BoltOptCategory));

// Distance threshold read by isInCacheLine(): once the summed size of the
// blocks between a block and its successor exceeds this value, the successor
// is treated as being outside the block's cache line. Defaults to 64.
static cl::opt<unsigned>
TailDuplicationMinimumOffset("tail-duplication-minimum-offset",
cl::desc("minimum offset needed between block "
"and successor to allow duplication"),
cl::ZeroOrMore, cl::ReallyHidden, cl::init(64),
cl::cat(BoltOptCategory));

// Byte-size threshold that aggressiveCodeToDuplicate() compares against the
// total output size of the blocks it has collected. Defaults to 64.
static cl::opt<unsigned> TailDuplicationMaximumDuplication(
"tail-duplication-maximum-duplication",
cl::desc("maximum size of duplicated blocks (in bytes)"), cl::ZeroOrMore,
cl::ReallyHidden, cl::init(64), cl::cat(BoltOptCategory));

} // namespace opts

namespace llvm {
namespace bolt {
bool TailDuplication::isInCacheLine(const BinaryBasicBlock &BB,
const BinaryBasicBlock &Succ) const {
if (&BB == &Succ)
return true;

BinaryFunction::BasicBlockOrderType BlockLayout =
BB.getFunction()->getLayout();
uint64_t Distance = 0;
int Direction = (Succ.getLayoutIndex() > BB.getLayoutIndex()) ? 1 : -1;

for (unsigned I = BB.getLayoutIndex() + Direction; I != Succ.getLayoutIndex();
I += Direction) {
Distance += BlockLayout[I]->getOriginalSize();
if (Distance > opts::TailDuplicationMinimumOffset)
return false;
}
return true;
}

/// Conservative candidate selection: proposes duplicating \p BB alone.
///
/// \returns a one-element vector holding \p BB, or an empty vector when one
/// of \p BB's successors is the block laid out immediately after it (copying
/// \p BB elsewhere would then need a new branch to reach that successor).
std::vector<BinaryBasicBlock *>
TailDuplication::moderateCodeToDuplicate(BinaryBasicBlock &BB) const {
  const auto FallthroughIndex = BB.getLayoutIndex() + 1;

  for (auto SuccIt = BB.succ_begin(), SuccEnd = BB.succ_end();
       SuccIt != SuccEnd; ++SuccIt) {
    // A successor in the next layout slot means BB falls through to it.
    if ((*SuccIt)->getLayoutIndex() == FallthroughIndex)
      return {};
  }

  return {&BB};
}

/// Aggressive candidate selection: collects a straight-line chain of blocks
/// starting at \p BB that could be duplicated together.
///
/// The chain is extended through single-successor blocks for as long as the
/// successor is the next block in the layout. It ends at a block with no
/// successors, at a block whose single successor is reached by a jump, or at
/// a block with two or more successors.
///
/// \returns the collected chain, or an empty vector when the chain ends in a
/// multi-successor block that falls through to the next layout block, or when
/// the chain's total output size exceeds
/// -tail-duplication-maximum-duplication.
std::vector<BinaryBasicBlock *>
TailDuplication::aggressiveCodeToDuplicate(BinaryBasicBlock &BB) const {
  std::vector<BinaryBasicBlock *> BlocksToDuplicate;
  BinaryBasicBlock *CurrBB = &BB;
  while (CurrBB) {
    BlocksToDuplicate.push_back(CurrBB);
    // With no successors, we've reached the end and should duplicate all of
    // BlocksToDuplicate.
    if (CurrBB->succ_size() == 0)
      break;

    // With two successors, if they're both a jump, we should duplicate all
    // blocks in BlocksToDuplicate. Otherwise (one of them is the layout
    // fallthrough) we cannot find a simple stream of blocks to copy.
    if (CurrBB->succ_size() >= 2) {
      if (CurrBB->getConditionalSuccessor(false)->getLayoutIndex() ==
              CurrBB->getLayoutIndex() + 1 ||
          CurrBB->getConditionalSuccessor(true)->getLayoutIndex() ==
              CurrBB->getLayoutIndex() + 1)
        BlocksToDuplicate.clear();
      break;
    }

    // With one successor, if it's a jump, we should duplicate all blocks in
    // BlocksToDuplicate. Otherwise, we should keep going.
    BinaryBasicBlock *Succ = CurrBB->getSuccessor();
    if (Succ->getLayoutIndex() != CurrBB->getLayoutIndex() + 1)
      break;
    CurrBB = Succ;
  }

  // Don't duplicate if it's too much code. The accumulator is 64-bit and
  // unsigned so that summing block sizes does not narrow through int.
  const uint64_t DuplicationByteCount = std::accumulate(
      std::begin(BlocksToDuplicate), std::end(BlocksToDuplicate), uint64_t{0},
      [](uint64_t Sum, BinaryBasicBlock *Block) {
        return Sum + Block->getOutputSize();
      });
  // Reject only chains that exceed the configured maximum size.
  if (DuplicationByteCount > opts::TailDuplicationMaximumDuplication)
    BlocksToDuplicate.clear();
  return BlocksToDuplicate;
}

/// Gathers tail duplication statistics for one function.
///
/// For every block in the layout this updates the dynamic counts of
/// unconditional branches and of all blocks. It then checks whether the block
/// is a duplication candidate: executed at least once, exactly one successor,
/// and that successor not within the approximate cache-line distance. If the
/// selected strategy returns a non-empty set of blocks, the opportunity is
/// recorded in the pass counters. No code is modified.
void TailDuplication::runOnFunction(BinaryFunction &Function) {
  for (BinaryBasicBlock *Block : Function.layout()) {
    const auto ExecCount = Block->getExecutionCount();
    const auto NumSuccs = Block->succ_size();
    const auto NextLayoutIndex = Block->getLayoutIndex() + 1;

    // Count taken unconditional branches: a lone successor that is not the
    // next block in layout, or a fallthrough edge that is not the next block.
    if (NumSuccs == 1) {
      if (Block->getSuccessor()->getLayoutIndex() != NextLayoutIndex)
        UnconditionalBranchDynamicCount += ExecCount;
    } else if (NumSuccs == 2) {
      if (Block->getFallthrough()->getLayoutIndex() != NextLayoutIndex)
        UnconditionalBranchDynamicCount +=
            Block->getFallthroughBranchInfo().Count;
    }
    AllBlocksDynamicCount += ExecCount;

    // Candidates must be hot and have exactly one successor.
    if (ExecCount == 0 || NumSuccs != 1)
      continue;

    // Skip successors that are already close enough in the layout.
    BinaryBasicBlock *Target = Block->getSuccessor();
    if (isInCacheLine(*Block, *Target))
      continue;

    const std::vector<BinaryBasicBlock *> Candidates =
        opts::TailDuplicationAggressive ? aggressiveCodeToDuplicate(*Target)
                                        : moderateCodeToDuplicate(*Target);
    if (Candidates.empty())
      continue;

    PossibleDuplications++;
    PossibleDuplicationsDynamicCount += ExecCount;
  }
}

/// Runs the analysis over every function in \p BC and prints a summary.
///
/// The summary reports the number of possible duplications, their combined
/// dynamic execution count, and that count as a percentage of (a) all
/// unconditional branch executions and (b) all block executions.
void TailDuplication::runOnFunctions(BinaryContext &BC) {
  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &Function = It.second;
    runOnFunction(Function);
  }

  // Percentage of Count relative to Total. The totals are zero when there is
  // no profile data (or no unconditional branches); report 0 in that case
  // rather than dividing by zero and printing NaN/inf.
  auto Percentage = [](uint64_t Count, uint64_t Total) -> double {
    if (Total == 0)
      return 0.0;
    return (static_cast<double>(Count) * 100.0) / static_cast<double>(Total);
  };

  outs() << "BOLT-INFO: tail duplication possible duplications: "
         << PossibleDuplications << "\n";
  outs() << "BOLT-INFO: tail duplication possible dynamic reductions: "
         << PossibleDuplicationsDynamicCount << "\n";
  outs() << "BOLT-INFO: tail duplication possible dynamic reductions to "
            "unconditional branch execution : "
         << format("%.1f", Percentage(PossibleDuplicationsDynamicCount,
                                      UnconditionalBranchDynamicCount))
         << "%\n";
  outs() << "BOLT-INFO: tail duplication possible dynamic reductions to all "
            "blocks execution : "
         << format("%.1f", Percentage(PossibleDuplicationsDynamicCount,
                                      AllBlocksDynamicCount))
         << "%\n";
}

} // end namespace bolt
} // end namespace llvm
84 changes: 84 additions & 0 deletions bolt/src/Passes/TailDuplication.h
@@ -0,0 +1,84 @@
//===--------- Passes/TailDuplication.h ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_TAILDUPLICATION_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_TAILDUPLICATION_H

#include "BinaryPasses.h"

// This pass finds cases where BBs have the layout:
// #BB0:
// <body>
// jmp #BB2
// ....
// #BB1
// <body>
// #BB2:
// <body>
//
// And duplicates #BB2 and puts it after #BB0:
// #BB0:
// <body>
// #BB2:
// <body>
// ....
// #BB1
// <body>
// #BB2:
// <body>
//
// The advantage is getting rid of an unconditional branch and, hopefully,
// improving i-cache performance by reducing fragmentation. The disadvantage is
// that if there is too much code duplication, we may end up evicting hot cache
// lines and causing the opposite effect, hurting i-cache performance. This
// needs to be well balanced to achieve the optimal effect.

namespace llvm {
namespace bolt {

/// Pass for duplicating blocks that would require a jump.
class TailDuplication : public BinaryFunctionPass {
/// Record how many possible tail duplications there can be.
uint64_t PossibleDuplications = 0;

/// Record how many times these duplications would get used.
uint64_t PossibleDuplicationsDynamicCount = 0;

/// Record the execution count of all unconditional branches
uint64_t UnconditionalBranchDynamicCount = 0;

/// Record the execution count of all blocks
uint64_t AllBlocksDynamicCount = 0;

/// True if Succ is in the same cache line as BB (approximately)
bool isInCacheLine(const BinaryBasicBlock &BB,
const BinaryBasicBlock &Succ) const;

/// Returns a vector of BinaryBasicBlock to copy after BB. If it's empty,
/// nothing should be duplicated
std::vector<BinaryBasicBlock *>
moderateCodeToDuplicate(BinaryBasicBlock &BB) const;
std::vector<BinaryBasicBlock *>
aggressiveCodeToDuplicate(BinaryBasicBlock &BB) const;

void runOnFunction(BinaryFunction &Function);

public:
explicit TailDuplication() : BinaryFunctionPass(false) {}

const char *getName() const override { return "tail duplication"; }

void runOnFunctions(BinaryContext &BC) override;
};

} // namespace bolt
} // namespace llvm

#endif
25 changes: 25 additions & 0 deletions bolt/test/X86/tail-duplication-pass.s
@@ -0,0 +1,25 @@
# REQUIRES: system-linux

# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
# RUN: %s -o %t.o
# RUN: link_fdata %s %t.o %t.fdata
# RUN: %host_cc %cflags %t.o -o %t.exe -Wl,-q
# RUN: llvm-bolt %t.exe -data %t.fdata -reorder-blocks=cache+ -print-finalized \
# RUN: -tail-duplication -tail-duplication-minimum-offset 1 -o %t.out | FileCheck %s

# FDATA: 1 main 2 1 main #.BB2# 0 10
# FDATA: 1 main 4 1 main #.BB2# 0 20
# CHECK: tail duplication possible duplications: 1

.text
.globl main
.type main, %function
.size main, .Lend-main
# Minimal function for the tail duplication analysis: the entry block ends in
# an unconditional jump over .BB1 to .BB2, and .BB1 falls through into .BB2.
# The CHECK line above expects exactly one possible duplication to be reported.
main:
# Entry block: zero %eax, then jump past .BB1.
xor %eax, %eax
jmp .BB2
.BB1:
# Skipped by the jump above; falls through into .BB2.
inc %rax
.BB2:
# Jump target and function exit.
retq
.Lend:

0 comments on commit 2f46660

Please sign in to comment.