Skip to content

Commit

Permalink
Cache+ speed, reduce mallocs
Browse files Browse the repository at this point in the history
Summary:
Speed of cache+ by skipping mallocs on vectors.

Although this change speeds up the algorithm by 2x, this is still not
enough for some binaries where some functions have ~2500 hot basic
blocks. Hence, introduce a threshold for expensive optimizations in
CachePlusReorderAlgorithm. If the number of hot basic blocks exceeds
the threshold (2048 by default), we use a cheaper version, which is
quite fast.

(cherry picked from FBD6928075)
  • Loading branch information
Andy Newell authored and maksfb committed Feb 9, 2018
1 parent 5599c01 commit e156230
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 68 deletions.
2 changes: 2 additions & 0 deletions bolt/CacheMetrics.cpp
Expand Up @@ -116,6 +116,8 @@ double calcExtTSPScore(

double Score = 0.0;
for (auto BF : BinaryFunctions) {
if (!BF->hasProfile())
continue;
for (auto SrcBB : BF->layout()) {
auto BI = SrcBB->branch_info_begin();
for (auto DstBB : SrcBB->successors()) {
Expand Down
207 changes: 139 additions & 68 deletions bolt/Passes/CachePlusReorderAlgorithm.cpp
Expand Up @@ -14,11 +14,25 @@
#include "CacheMetrics.h"
#include "ReorderAlgorithm.h"
#include "ReorderUtils.h"
#include "llvm/Support/Options.h"

using namespace llvm;
using namespace bolt;
using EdgeList = std::vector<std::pair<BinaryBasicBlock *, uint64_t>>;

namespace opts {

extern cl::OptionCategory BoltOptCategory;

cl::opt<unsigned>
ClusterSplitThreshold("cluster-split-threshold",
cl::desc("The maximum size of a function to apply splitting of clusters"),
cl::init(2048),
cl::ZeroOrMore,
cl::cat(BoltOptCategory));

}

namespace llvm {
namespace bolt {

Expand Down Expand Up @@ -88,6 +102,59 @@ class Cluster {
double Score;
};

using ClusterIter = std::vector<BinaryBasicBlock *>::const_iterator;

// A wrapper around three clusters of basic blocks; it is used to avoid extra
// instantiation of the vectors.
class MergedCluster {
public:
MergedCluster(ClusterIter Begin1,
ClusterIter End1,
ClusterIter Begin2,
ClusterIter End2,
ClusterIter Begin3,
ClusterIter End3)
: Begin1(Begin1),
End1(End1),
Begin2(Begin2),
End2(End2),
Begin3(Begin3),
End3(End3) {}

template<typename F>
void forEach(const F &Func) const {
for (auto It = Begin1; It != End1; It++)
Func(*It);
for (auto It = Begin2; It != End2; It++)
Func(*It);
for (auto It = Begin3; It != End3; It++)
Func(*It);
}

std::vector<BinaryBasicBlock *> getBlocks() const {
std::vector<BinaryBasicBlock *> Result;
Result.reserve(std::distance(Begin1, End1) +
std::distance(Begin2, End2) +
std::distance(Begin3, End3));
Result.insert(Result.end(), Begin1, End1);
Result.insert(Result.end(), Begin2, End2);
Result.insert(Result.end(), Begin3, End3);
return Result;
}

const BinaryBasicBlock *getFirstBlock() const {
return *Begin1;
}

private:
ClusterIter Begin1;
ClusterIter End1;
ClusterIter Begin2;
ClusterIter End2;
ClusterIter Begin3;
ClusterIter End3;
};

/// Deterministically compare clusters by their density in decreasing order
bool compareClusters(const Cluster *C1, const Cluster *C2) {
// original entry point to the front
Expand Down Expand Up @@ -140,8 +207,11 @@ bool compareClusterPairs(const Cluster *A1, const Cluster *B1,
/// while keeping the implementation sufficiently fast.
class CachePlus {
public:
CachePlus(const BinaryFunction &BF)
: BF(BF), Adjacent(BF.layout_size()), Cache(BF.layout_size()) {
CachePlus(const BinaryFunction &BF, bool UseClusterSplitting)
: BF(BF),
UseClusterSplitting(UseClusterSplitting),
Adjacent(BF.layout_size()),
Cache(BF.layout_size()) {
initialize();
}

Expand Down Expand Up @@ -338,31 +408,37 @@ class CachePlus {
}

/// Compute ExtTSP score for a given order of basic blocks
double score(const std::vector<BinaryBasicBlock *>& Blocks) const {
double score(const MergedCluster& MergedBlocks) const {
uint64_t NotSet = static_cast<uint64_t>(-1);
auto Addr = std::vector<uint64_t>(BF.layout_size(), NotSet);
EstimatedAddr.assign(BF.layout_size(), NotSet);

uint64_t CurAddr = 0;
for (auto BB : Blocks) {
size_t Index = BB->getLayoutIndex();
Addr[Index] = CurAddr;
CurAddr += Size[Index];
}
MergedBlocks.forEach(
[&](const BinaryBasicBlock *BB) {
size_t Index = BB->getLayoutIndex();
EstimatedAddr[Index] = CurAddr;
CurAddr += Size[Index];
}
);

double Score = 0;
for (auto BB : Blocks) {
size_t Index = BB->getLayoutIndex();
for (auto Edge : OutEdges[Index]) {
auto SuccBB = Edge.first;
size_t SuccIndex = SuccBB->getLayoutIndex();

if (Addr[SuccBB->getLayoutIndex()] != NotSet) {
Score += CacheMetrics::extTSPScore(Addr[Index],
Size[Index],
Addr[SuccIndex],
Edge.second);
MergedBlocks.forEach(
[&](const BinaryBasicBlock *BB) {
size_t Index = BB->getLayoutIndex();
for (auto Edge : OutEdges[Index]) {
auto SuccBB = Edge.first;
size_t SuccIndex = SuccBB->getLayoutIndex();

if (EstimatedAddr[SuccIndex] != NotSet) {
Score += CacheMetrics::extTSPScore(EstimatedAddr[Index],
Size[Index],
EstimatedAddr[SuccIndex],
Edge.second);
}
}
}
}
);

return Score;
}

Expand Down Expand Up @@ -391,7 +467,7 @@ class CachePlus {
MergeType);
// Does the new cluster preserve the original entry point?
if ((ClusterPred->isEntryPoint() || ClusterSucc->isEntryPoint()) &&
MergedBlocks[0]->getLayoutIndex() != 0)
MergedBlocks.getFirstBlock()->getLayoutIndex() != 0)
return CurGain;

// The score of the new cluster
Expand All @@ -405,18 +481,20 @@ class CachePlus {
std::pair<double, size_t> Gain = std::make_pair(-1, 0);
// Try to concatenate two clusters w/o splitting
Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, 0);
// Try to split ClusterPred into two and merge with ClusterSucc
for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) {
// Make sure the splitting does not break FT successors
auto BB = ClusterPred->blocks()[Offset - 1];
if (FallthroughSucc[BB->getLayoutIndex()] != nullptr) {
assert(FallthroughSucc[BB->getLayoutIndex()] == ClusterPred->blocks()[Offset]);
continue;
}
if (UseClusterSplitting) {
// Try to split ClusterPred into two and merge with ClusterSucc
for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) {
// Make sure the splitting does not break FT successors
auto BB = ClusterPred->blocks()[Offset - 1];
if (FallthroughSucc[BB->getLayoutIndex()] != nullptr) {
assert(FallthroughSucc[BB->getLayoutIndex()] == ClusterPred->blocks()[Offset]);
continue;
}

for (size_t Type = 0; Type < 4; Type++) {
size_t MergeType = 1 + Type + Offset * 4;
Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType);
for (size_t Type = 0; Type < 4; Type++) {
size_t MergeType = 1 + Type + Offset * 4;
Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType);
}
}
}

Expand All @@ -426,29 +504,16 @@ class CachePlus {

/// Merge two clusters (orders) of blocks according to a given 'merge type'.
///
/// If MergeType == 0, then the results is a concatentation of two clusters.
/// If MergeType == 0, then the result is a concatentation of two clusters.
/// Otherwise, the first cluster is cut into two and we consider all possible
/// ways of concatenating three clusters.
std::vector<BinaryBasicBlock *> mergeBlocks(
const std::vector<BinaryBasicBlock *> &X,
const std::vector<BinaryBasicBlock *> &Y,
size_t MergeType
) const {
// Concatenate three clusters of blocks in the given order
auto concat = [&](const std::vector<BinaryBasicBlock *> &A,
const std::vector<BinaryBasicBlock *> &B,
const std::vector<BinaryBasicBlock *> &C) {
std::vector<BinaryBasicBlock *> Result;
Result.reserve(A.size() + B.size() + C.size());
Result.insert(Result.end(), A.begin(), A.end());
Result.insert(Result.end(), B.begin(), B.end());
Result.insert(Result.end(), C.begin(), C.end());
return Result;
};

MergedCluster mergeBlocks(const std::vector<BinaryBasicBlock *> &X,
const std::vector<BinaryBasicBlock *> &Y,
size_t MergeType) const {
// Merging w/o splitting existing clusters
if (MergeType == 0) {
return concat(X, Y, std::vector<BinaryBasicBlock *>());
ClusterIter Empty;
return MergedCluster(X.begin(), X.end(), Y.begin(), Y.end(), Empty, Empty);
}

MergeType--;
Expand All @@ -457,15 +522,19 @@ class CachePlus {
assert(0 < Offset && Offset < X.size() &&
"Invalid offset while merging clusters");
// Split the first cluster, X, into X1 and X2
std::vector<BinaryBasicBlock *> X1(X.begin(), X.begin() + Offset);
std::vector<BinaryBasicBlock *> X2(X.begin() + Offset, X.end());
ClusterIter BeginX1 = X.begin();
ClusterIter EndX1 = X.begin() + Offset;
ClusterIter BeginX2 = X.begin() + Offset;
ClusterIter EndX2 = X.end();
ClusterIter BeginY = Y.begin();
ClusterIter EndY = Y.end();

// Construct a new cluster from three existing ones
switch(Type) {
case 0: return concat(X1, Y, X2);
case 1: return concat(Y, X2, X1);
case 2: return concat(X2, Y, X1);
case 3: return concat(X2, X1, Y);
case 0: return MergedCluster(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
case 1: return MergedCluster(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
case 2: return MergedCluster(BeginX2, EndX2, BeginY, EndY, BeginX1, EndX1);
case 3: return MergedCluster(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
default:
llvm_unreachable("unexpected merge type");
}
Expand All @@ -479,7 +548,7 @@ class CachePlus {

// Merge the blocks of clusters
auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType);
Into->merge(From, MergedBlocks, score(MergedBlocks));
Into->merge(From, MergedBlocks.getBlocks(), score(MergedBlocks));

// Remove cluster From from the list of active clusters
auto Iter = std::remove(Clusters.begin(), Clusters.end(), From);
Expand All @@ -495,6 +564,9 @@ class CachePlus {
// The binary function
const BinaryFunction &BF;

// Indicates whether to use cluster splitting for optimization
bool UseClusterSplitting;

// All clusters
std::vector<Cluster> AllClusters;

Expand All @@ -520,6 +592,9 @@ class CachePlus {
// containing both x and y and all clusters adjacent to x and y (and recompute
// them on the next iteration).
mutable ClusterPairCache<Cluster, std::pair<double, size_t>> Cache;

// A reusable vector used within score() method
mutable std::vector<uint64_t> EstimatedAddr;
};

void CachePlusReorderAlgorithm::reorderBasicBlocks(
Expand All @@ -528,26 +603,22 @@ void CachePlusReorderAlgorithm::reorderBasicBlocks(
return;

// Are there jumps with positive execution count?
uint64_t SumCount = 0;
size_t NumHotBlocks = 0;
for (auto BB : BF.layout()) {
auto BI = BB->branch_info_begin();
for (auto I : BB->successors()) {
assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && I != nullptr);
SumCount += BI->Count;
++BI;
}
if (BB->getKnownExecutionCount() > 0)
NumHotBlocks++;
}

// Do not change layout of functions w/o profile information
if (SumCount == 0) {
if (NumHotBlocks == 0) {
for (auto BB : BF.layout()) {
Order.push_back(BB);
}
return;
}

// Apply the algorithm
Order = CachePlus(BF).run();
Order = CachePlus(BF, NumHotBlocks <= opts::ClusterSplitThreshold).run();

// Verify correctness
assert(Order[0]->isEntryPoint() && "Original entry point is not preserved");
Expand Down

0 comments on commit e156230

Please sign in to comment.