Skip to content

Commit

Permalink
[BOLT] faster cache+ implementation
Browse files Browse the repository at this point in the history
Summary:
Speeding up cache+ algorithm.

The idea is to find and merge "fallthrough" successors before the main
optimization. For a pair of blocks, A and B, block B is the fallthrough
successor of A if (i) all jumps (based on profile) from A go to B
and (ii) all jumps to B are from A.
Such blocks should be adjacent in an optimal ordering, and should
not be considered for splitting. (This is what gives the speed-up.)

The gap between cache and cache+ was reduced from ~2m to ~1m.

(cherry picked from FBD6799900)
  • Loading branch information
spupyrev authored and maksfb committed Jan 24, 2018
1 parent 89feb84 commit 626e977
Showing 1 changed file with 88 additions and 6 deletions.
94 changes: 88 additions & 6 deletions bolt/Passes/CachePlusReorderAlgorithm.cpp
Expand Up @@ -147,21 +147,38 @@ class CachePlus {

/// Run cache+ algorithm and return a basic block ordering
std::vector<BinaryBasicBlock *> run() {
// Merge blocks with their fallthrough successors
for (auto BB : BF.layout()) {
if (FallthroughPred[BB->getLayoutIndex()] == nullptr &&
FallthroughSucc[BB->getLayoutIndex()] != nullptr) {
auto CurBB = BB;
while (FallthroughSucc[CurBB->getLayoutIndex()] != nullptr) {
const auto NextBB = FallthroughSucc[CurBB->getLayoutIndex()];
mergeClusters(&AllClusters[BB->getLayoutIndex()],
&AllClusters[NextBB->getLayoutIndex()],
0);
CurBB = NextBB;
}
}
}

// Merge pairs of clusters while there is an improvement in ExtTSP metric
while (Clusters.size() > 1) {
Cluster *BestClusterPred = nullptr;
Cluster *BestClusterSucc = nullptr;
std::pair<double, size_t> BestGain(-1, 0);
for (auto ClusterPred : Clusters) {
// Do not merge cold blocks
if (ClusterPred->isCold())
continue;

// Get candidates for merging with the current cluster
Adjacent.forAllAdjacent(
ClusterPred,
// Find the best candidate
[&](Cluster *ClusterSucc) {
assert(ClusterPred != ClusterSucc && "loop edges are not supported");
// Do not merge cold blocks
if (ClusterPred->isCold() || ClusterSucc->isCold())
return;
assert(!ClusterSucc->isCold() && "cannot merge cold clusters");

// Compute the gain of merging two clusters
auto Gain = mergeGain(ClusterPred, ClusterSucc);
Expand Down Expand Up @@ -261,12 +278,63 @@ class CachePlus {
// Initialize adjacency matrix
Adjacent.initialize(Clusters);
for (auto BB : BF.layout()) {
auto BI = BB->branch_info_begin();
for (auto I : BB->successors()) {
if (BB != I)
if (BB != I && BI->Count > 0) {
Adjacent.set(Clusters[BB->getLayoutIndex()],
Clusters[I->getLayoutIndex()]);
}
++BI;
}
}

// Initialize fallthrough successors
findFallthroughBlocks(InWeight, OutWeight);
}

/// For a pair of blocks, A and B, block B is the fallthrough successor of A,
/// if (i) all jumps (based on profile) from A go to B and (ii) all jumps
/// to B are from A. Such blocks should be adjacent in an optimal ordering,
/// and the method finds such pairs of blocks.
///
/// \param InWeight  per-layout-index totals of incoming jump weight
///                  (presumably computed by the caller from profile counts --
///                  verify against the initialization code).
/// \param OutWeight per-layout-index totals of outgoing jump weight.
///
/// Results are written to FallthroughSucc/FallthroughPred, both indexed by
/// layout index; a nullptr entry means "no fallthrough partner".
void findFallthroughBlocks(const std::vector<uint64_t> &InWeight,
const std::vector<uint64_t> &OutWeight) {
// Reset both maps; every block starts with no fallthrough partner.
FallthroughSucc = std::vector<BinaryBasicBlock *>(BF.size(), nullptr);
FallthroughPred = std::vector<BinaryBasicBlock *>(BF.size(), nullptr);
// Find fallthroughs based on edge weights
for (auto BB : BF.layout()) {
// A block with zero outgoing weight has no profiled jumps out of it,
// so it cannot have a fallthrough successor.
if (OutWeight[BB->getLayoutIndex()] == 0)
continue;
for (auto Edge : OutEdges[BB->getLayoutIndex()]) {
const auto SuccBB = Edge.first;
// Successor cannot be the first BB, which is pinned.
// SuccBB qualifies only when this single edge carries ALL of BB's
// outgoing weight and ALL of SuccBB's incoming weight -- i.e. every
// profiled jump from BB goes to SuccBB and every profiled jump to
// SuccBB comes from BB.
if (OutWeight[BB->getLayoutIndex()] == Edge.second &&
InWeight[SuccBB->getLayoutIndex()] == Edge.second &&
SuccBB->getLayoutIndex() != 0) {
FallthroughSucc[BB->getLayoutIndex()] = SuccBB;
FallthroughPred[SuccBB->getLayoutIndex()] = BB;
// At most one fallthrough successor per block; stop at the first.
break;
}
}
}

// There might be 'cycles' in the fallthrough dependencies (since profile
// data isn't 100% accurate).
// Break the cycles by choosing the block with smallest index as the tail
for (auto BB : BF.layout()) {
const auto Idx = BB->getLayoutIndex();
// Only a block in the middle of a chain (having both a fallthrough
// predecessor and successor) can lie on a cycle.
if (FallthroughSucc[Idx] == nullptr || FallthroughPred[Idx] == nullptr)
continue;

// Walk the successor chain; reaching BB again means the chain is a
// cycle, while reaching nullptr means it terminates normally.
auto SuccBB = FallthroughSucc[Idx];
while (SuccBB != nullptr && SuccBB != BB) {
SuccBB = FallthroughSucc[SuccBB->getLayoutIndex()];
}
if (SuccBB == nullptr)
continue;
// break the cycle: detach BB from its fallthrough predecessor so BB
// becomes the head of the (now acyclic) chain
FallthroughSucc[FallthroughPred[Idx]->getLayoutIndex()] = nullptr;
FallthroughPred[Idx] = nullptr;
}
}

/// Compute ExtTSP score for a given order of basic blocks
Expand Down Expand Up @@ -335,10 +403,17 @@ class CachePlus {
};

std::pair<double, size_t> Gain = std::make_pair(-1, 0);
// Try to simply concatenate two clusters
// Try to concatenate two clusters w/o splitting
Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, 0);
// Try to split ClusterPred into two and merge with ClusterSucc
for (size_t Offset = 1; Offset < ClusterPred->blocks().size(); Offset++) {
// Make sure the splitting does not break FT successors
auto BB = ClusterPred->blocks()[Offset - 1];
if (FallthroughSucc[BB->getLayoutIndex()] != nullptr) {
assert(FallthroughSucc[BB->getLayoutIndex()] == ClusterPred->blocks()[Offset]);
continue;
}

for (size_t Type = 0; Type < 4; Type++) {
size_t MergeType = 1 + Type + Offset * 4;
Gain = computeMergeGain(Gain, ClusterPred, ClusterSucc, MergeType);
Expand Down Expand Up @@ -400,7 +475,9 @@ class CachePlus {
/// adjacency information, and the corresponding cache.
void mergeClusters(Cluster *Into, Cluster *From, size_t MergeType) {
assert(Into != From && "Cluster cannot be merged with itself");
// Merge the clusters
assert(!Into->isCold() && !From->isCold() && "Merging cold clusters");

// Merge the blocks of clusters
auto MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeType);
Into->merge(From, MergedBlocks, score(MergedBlocks));

Expand Down Expand Up @@ -433,6 +510,11 @@ class CachePlus {
// Cluster adjacency matrix
AdjacencyMatrix<Cluster> Adjacent;

// Fallthrough successor of the block
std::vector<BinaryBasicBlock *> FallthroughSucc;
// Fallthrough predecessor of the block
std::vector<BinaryBasicBlock *> FallthroughPred;

// A cache that keeps precomputed values of mergeGain for pairs of clusters;
// when a pair of clusters (x,y) gets merged, we invalidate the pairs
// containing both x and y and all clusters adjacent to x and y (and recompute
Expand Down

0 comments on commit 626e977

Please sign in to comment.