Skip to content

Commit

Permalink
[PR] Introduce loop inversion pass
Browse files Browse the repository at this point in the history
Summary:
This patch introduces LoopInversionPass. Its main purpose is to ensure
that the loop layout is optimal with respect to the profile information:
if the profile shows that the loop is used, the unconditional jump
instruction should be executed only once, and vice versa. Please take
a look at the pass header file and the test for more details.

Also change link_fdata script a bit, to be able to change FDATA prefix,
like FileCheck does.

Vladislav Khmelevsky,
Advanced Software Technology Lab, Huawei

PR facebookarchive/BOLT#153

(cherry picked from FBD28391811)
  • Loading branch information
yota9 authored and maksfb committed May 11, 2021
1 parent 12e9fec commit 79807d9
Show file tree
Hide file tree
Showing 7 changed files with 234 additions and 36 deletions.
3 changes: 3 additions & 0 deletions bolt/src/BinaryPassManager.cpp
Expand Up @@ -18,6 +18,7 @@
#include "Passes/Instrumentation.h"
#include "Passes/JTFootprintReduction.h"
#include "Passes/LongJmp.h"
#include "Passes/LoopInversionPass.h"
#include "Passes/PLTCall.h"
#include "Passes/PatchEntries.h"
#include "Passes/RegReAssign.h"
Expand Down Expand Up @@ -446,6 +447,8 @@ void BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) {

Manager.registerPass(std::make_unique<SplitFunctions>(PrintSplit));

Manager.registerPass(std::make_unique<LoopInversionPass>());

// This pass syncs local branches with CFG. If any of the following
// passes breaks the sync - they either need to re-run the pass or
// fix branches consistency internally.
Expand Down
58 changes: 24 additions & 34 deletions bolt/src/Passes/BinaryPasses.cpp
Expand Up @@ -8,7 +8,6 @@
//
//===----------------------------------------------------------------------===//

#include "BinaryFunction.h"
#include "BinaryPasses.h"
#include "ParallelUtilities.h"
#include "Passes/ReorderAlgorithm.h"
Expand Down Expand Up @@ -150,39 +149,30 @@ PrintUnknownCFG("print-unknown-cfg",
cl::cat(BoltCategory),
cl::ReallyHidden);

static cl::opt<bolt::ReorderBasicBlocks::LayoutType>
ReorderBlocks("reorder-blocks",
cl::desc("change layout of basic blocks in a function"),
cl::init(bolt::ReorderBasicBlocks::LT_NONE),
cl::values(
clEnumValN(bolt::ReorderBasicBlocks::LT_NONE,
"none",
"do not reorder basic blocks"),
clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE,
"reverse",
"layout blocks in reverse order"),
clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE,
"normal",
"perform optimal layout based on profile"),
clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
"branch-predictor",
"perform optimal layout prioritizing branch "
"predictions"),
clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE,
"cache",
"perform optimal layout prioritizing I-cache "
"behavior"),
clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP,
"cache+",
"perform layout optimizing I-cache behavior"),
clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP,
"ext-tsp",
"perform layout optimizing I-cache behavior"),
clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
"cluster-shuffle",
"perform random layout of clusters")),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));
// Command-line option "-reorder-blocks": selects the basic block layout
// algorithm applied to each function. Defaults to LT_NONE (no reordering) and
// may be given zero or more times on the command line.
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
"reorder-blocks", cl::desc("change layout of basic blocks in a function"),
cl::init(bolt::ReorderBasicBlocks::LT_NONE),
cl::values(
clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
"do not reorder basic blocks"),
clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
"layout blocks in reverse order"),
clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
"perform optimal layout based on profile"),
clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
"branch-predictor",
"perform optimal layout prioritizing branch "
"predictions"),
clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
"perform optimal layout prioritizing I-cache "
"behavior"),
// "cache+" and "ext-tsp" are two spellings of the same layout type
// (LT_OPTIMIZE_EXT_TSP).
clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "cache+",
"perform layout optimizing I-cache behavior"),
clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
"perform layout optimizing I-cache behavior"),
clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
"cluster-shuffle", "perform random layout of clusters")),
cl::ZeroOrMore, cl::cat(BoltOptCategory));

cl::opt<unsigned>
ExecutionCountThreshold("execution-count-threshold",
Expand Down
3 changes: 2 additions & 1 deletion bolt/src/Passes/CMakeLists.txt
Expand Up @@ -17,8 +17,9 @@ add_llvm_library(LLVMBOLTPasses
Inliner.cpp
Instrumentation.cpp
JTFootprintReduction.cpp
LivenessAnalysis.cpp
LongJmp.cpp
LoopInversionPass.cpp
LivenessAnalysis.cpp
MCF.cpp
PatchEntries.cpp
PettisAndHansen.cpp
Expand Down
103 changes: 103 additions & 0 deletions bolt/src/Passes/LoopInversionPass.cpp
@@ -0,0 +1,103 @@
//===--------- Passes/LoopInversionPass.cpp -------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "LoopInversionPass.h"
#include "ParallelUtilities.h"

using namespace llvm;

// Command-line options consulted by LoopInversionPass.
namespace opts {
// Option category defined in another translation unit; the option below is
// registered under it.
extern cl::OptionCategory BoltCategory;

// The "-reorder-blocks" option, defined in another translation unit. The pass
// is skipped entirely when it is set to LT_NONE.
extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;

// "-loop-inversion-opt": enables the loop inversion pass. On by default and
// marked cl::ReallyHidden.
static cl::opt<bool> LoopReorder(
"loop-inversion-opt",
cl::desc("reorder unconditional jump instructions in loops optimization"),
cl::init(true), cl::cat(BoltCategory), cl::ReallyHidden);
} // namespace opts

namespace llvm {
namespace bolt {

/// Reorder single-block loops in \p BF so that their layout matches the
/// profile.
///
/// A candidate is a block BB (the loop body) whose only predecessor and only
/// successor are the same block SuccBB (the loop header), where SuccBB has
/// exactly two successors: BB and the loop exit. Depending on which of the two
/// header edges has the higher count, the body is placed either before the
/// header (loop edge is at least as hot as the exit edge) or after it (exit
/// edge is hotter). Blocks are moved by swapping their layout indices and then
/// re-sorting the layout by index.
///
/// \returns true if the layout of \p BF was changed.
bool LoopInversionPass::runOnFunction(BinaryFunction &BF) {
  // At least three blocks are required for the pattern, and the decision is
  // driven by branch counts, so a valid profile is needed.
  if (BF.layout_size() < 3 || !BF.hasValidProfile())
    return false;

  bool IsChanged = false;
  BF.updateLayoutIndices();
  for (BinaryBasicBlock *BB : BF.layout()) {
    // The loop body must have exactly one way in and one way out.
    if (BB->succ_size() != 1 || BB->pred_size() != 1)
      continue;

    BinaryBasicBlock *SuccBB = *BB->succ_begin();
    BinaryBasicBlock *PredBB = *BB->pred_begin();
    const unsigned BBIndex = BB->getLayoutIndex();
    const unsigned SuccBBIndex = SuccBB->getLayoutIndex();

    // Require a two-block loop (body <-> header), never touch the block at
    // layout index 0, and only swap blocks that agree on isCold().
    if (SuccBB != PredBB || BB == SuccBB || BBIndex == 0 || SuccBBIndex == 0 ||
        SuccBB->succ_size() != 2 || BB->isCold() != SuccBB->isCold())
      continue;

    // Get the second successor of the header (the block after the loop).
    BinaryBasicBlock *SecondSucc = nullptr;
    for (BinaryBasicBlock *Succ : SuccBB->successors()) {
      if (Succ != &*BB) {
        SecondSucc = Succ;
        break;
      }
    }

    assert(SecondSucc != nullptr && "Unable to find second BB successor");
    const uint64_t LoopCount = SuccBB->getBranchInfo(*BB).Count;
    const uint64_t ExitCount = SuccBB->getBranchInfo(*SecondSucc).Count;

    // A hot loop wants the body before the header; a cold loop wants the
    // header first. Skip the swap when the current order already matches the
    // profile. Previously only the cold case was recognised, so a hot loop
    // that was already laid out body-first got swapped into the worse order.
    const bool LoopIsHot = LoopCount >= ExitCount;
    const bool BodyBeforeHeader = BBIndex < SuccBBIndex;
    if (LoopIsHot == BodyBeforeHeader)
      continue;

    IsChanged = true;
    BB->setLayoutIndex(SuccBBIndex);
    SuccBB->setLayoutIndex(BBIndex);
  }

  if (IsChanged) {
    // Materialize the new order from the (swapped) layout indices.
    BinaryFunction::BasicBlockOrderType NewOrder = BF.getLayout();
    std::sort(NewOrder.begin(), NewOrder.end(),
              [](BinaryBasicBlock *BB1, BinaryBasicBlock *BB2) {
                return BB1->getLayoutIndex() < BB2->getLayoutIndex();
              });
    BF.updateBasicBlockLayout(NewOrder);
  }

  return IsChanged;
}

/// Pass entry point: applies runOnFunction() to every function accepted by
/// shouldOptimize() and reports how many functions were modified.
///
/// Does nothing when block reordering is disabled (-reorder-blocks=none) or
/// when the pass itself is switched off via -loop-inversion-opt.
void LoopInversionPass::runOnFunctions(BinaryContext &BC) {
  if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
    return;
  if (!opts::LoopReorder)
    return;

  // Functions are processed through ParallelUtilities, so the counter is
  // atomic.
  std::atomic<uint64_t> NumModified{0};

  ParallelUtilities::PredicateTy ShouldSkip =
      [&](const BinaryFunction &Function) {
        return !shouldOptimize(Function);
      };

  ParallelUtilities::WorkFuncTy InvertLoops = [&](BinaryFunction &Function) {
    const bool Modified = runOnFunction(Function);
    if (Modified)
      ++NumModified;
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, InvertLoops,
      ShouldSkip, "LoopInversionPass");

  outs() << "BOLT-INFO: " << NumModified
         << " Functions were reordered by LoopInversionPass\n";
}

} // end namespace bolt
} // end namespace llvm
61 changes: 61 additions & 0 deletions bolt/src/Passes/LoopInversionPass.h
@@ -0,0 +1,61 @@
//===--------- Passes/LoopInversionPass.h ---------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_TOOLS_LLVM_BOLT_PASSES_LOOPINVERSION_H
#define LLVM_TOOLS_LLVM_BOLT_PASSES_LOOPINVERSION_H

#include "BinaryPasses.h"

// This pass finds cases where the BBs have the following layout:
// #BB0:
// ....
// #BB1:
// cmp
// cond_jmp #BB3
// #BB2:
// <loop body>
// jmp #BB1
// #BB3:
// <loop exit>
//
// And swaps BB1 and BB2:
// #BB0:
// ....
// jmp #BB1
// #BB2:
// <loop body>
// #BB1:
// cmp
// cond_njmp #BB2
// #BB3:
// <loop exit>
//
// And vice versa depending on the profile information.
// The advantage is that the loop uses only one conditional jump,
// the unconditional jump is only used once on the loop start.

namespace llvm {
namespace bolt {

/// Binary function pass that swaps a loop header with its single-block loop
/// body, in either direction, according to the profile counts on the header's
/// outgoing edges.
class LoopInversionPass : public BinaryFunctionPass {
public:
  explicit LoopInversionPass() : BinaryFunctionPass(false) {}

  /// Name identifying this pass.
  const char *getName() const override { return "loop-inversion-opt"; }

  /// Process a single function; returns true if its block layout was changed.
  bool runOnFunction(BinaryFunction &Function);

  /// Pass entry point: runs the optimization over the functions in \p BC.
  void runOnFunctions(BinaryContext &BC) override;
};

} // namespace bolt
} // namespace llvm

#endif
38 changes: 38 additions & 0 deletions bolt/test/X86/loop-inversion-pass.s
@@ -0,0 +1,38 @@
# REQUIRES: system-linux

# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \
# RUN: %s -o %t.o
# RUN: link_fdata %s %t.o %t.fdata
# RUN: link_fdata %s %t.o %t.fdata2 "FDATA2"
# RUN: %host_cc %cflags %t.o -o %t.exe -Wl,-q
# RUN: llvm-bolt %t.exe -data %t.fdata -reorder-blocks=cache+ -print-finalized \
# RUN: -loop-inversion-opt -o %t.out | FileCheck %s
# RUN: llvm-bolt %t.exe -data %t.fdata2 -reorder-blocks=cache+ -print-finalized \
# RUN: -loop-inversion-opt -o %t.out2 | FileCheck --check-prefix="CHECK2" %s

# The case where loop is used:
# FDATA: 1 main 2 1 main #.J1# 0 420
# FDATA: 1 main b 1 main #.Jloop# 0 420
# FDATA: 1 main b 1 main d 0 1
# CHECK: BB Layout : .LBB00, .Ltmp0, .Ltmp1, .LFT0

# The case where loop is unused:
# FDATA2: 1 main 2 1 main #.J1# 0 420
# FDATA2: 1 main b 1 main #.Jloop# 0 1
# FDATA2: 1 main b 1 main d 0 420
# CHECK2: BB Layout : .LBB00, .Ltmp1, .LFT0, .Ltmp0

.text
.globl main
.type main, %function
.size main, .Lend-main
main:
xor %eax, %eax
jmp .J1
.Jloop:
inc %rax
.J1:
cmp $16, %rax
jl .Jloop
retq
.Lend:
4 changes: 3 additions & 1 deletion bolt/test/link_fdata.sh
@@ -1,6 +1,8 @@
#!/bin/bash -e

grep -e '^# FDATA:' < "$1" | cut -c10- > "$3"
prefix=${4:-"FDATA"}

grep -e "^# ${prefix}:" < "$1" | sed -E "s/# ${prefix}: //g" > "$3"
mapfile -t symbols < <(nm --defined-only "$2")

for line in "${symbols[@]}"; do
Expand Down

0 comments on commit 79807d9

Please sign in to comment.