Skip to content

Commit

Permalink
[ELF] Add profile guided section layout
Browse files Browse the repository at this point in the history
This adds profile guided layout using the Call-Chain Clustering (C³) heuristic
from https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf .

RFC: [llvm-dev] [RFC] Profile guided section layout
     http://lists.llvm.org/pipermail/llvm-dev/2017-June/114178.html

Pass `--call-graph-ordering-file <file>` to read a call graph profile where each
line has the format:

    <from symbol> <to symbol> <call count>

Differential Revision: https://reviews.llvm.org/D36351

llvm-svn: 330234
  • Loading branch information
Bigcheese committed Apr 17, 2018
1 parent 66f1c9a commit b842725
Show file tree
Hide file tree
Showing 13 changed files with 693 additions and 19 deletions.
1 change: 1 addition & 0 deletions lld/ELF/CMakeLists.txt
Expand Up @@ -19,6 +19,7 @@ add_lld_library(lldELF
Arch/SPARCV9.cpp
Arch/X86.cpp
Arch/X86_64.cpp
CallGraphSort.cpp
Driver.cpp
DriverUtils.cpp
EhFrame.cpp
Expand Down
251 changes: 251 additions & 0 deletions lld/ELF/CallGraphSort.cpp
@@ -0,0 +1,251 @@
//===- CallGraphSort.cpp --------------------------------------------------===//
//
// The LLVM Linker
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// Implementation of Call-Chain Clustering from: Optimizing Function Placement
/// for Large-Scale Data-Center Applications
/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
///
/// The goal of this algorithm is to improve runtime performance of the final
/// executable by arranging code sections such that page table and i-cache
/// misses are minimized.
///
/// Definitions:
/// * Cluster
/// * An ordered list of input sections which are layed out as a unit. At the
/// beginning of the algorithm each input section has its own cluster and
/// the weight of the cluster is the sum of the weight of all incomming
/// edges.
/// * Call-Chain Clustering (C³) Heuristic
/// * Defines when and how clusters are combined. Pick the highest weighted
/// input section then add it to its most likely predecessor if it wouldn't
/// penalize it too much.
/// * Density
/// * The weight of the cluster divided by the size of the cluster. This is a
/// proxy for the ammount of execution time spent per byte of the cluster.
///
/// It does so given a call graph profile by the following:
/// * Build a weighted call graph from the call graph profile
/// * Sort input sections by weight
/// * For each input section starting with the highest weight
/// * Find its most likely predecessor cluster
/// * Check if the combined cluster would be too large, or would have too low
/// a density.
/// * If not, then combine the clusters.
/// * Sort non-empty clusters by density
///
//===----------------------------------------------------------------------===//

#include "CallGraphSort.h"
#include "OutputSections.h"
#include "SymbolTable.h"
#include "Symbols.h"

using namespace llvm;
using namespace lld;
using namespace lld::elf;

namespace {
struct Edge {
int From;
uint64_t Weight;
};

struct Cluster {
Cluster(int Sec, size_t S) {
Sections.push_back(Sec);
Size = S;
}

double getDensity() const {
if (Size == 0)
return 0;
return double(Weight) / double(Size);
}

std::vector<int> Sections;
size_t Size = 0;
uint64_t Weight = 0;
uint64_t InitialWeight = 0;
std::vector<Edge> Preds;
};

class CallGraphSort {
public:
CallGraphSort();

DenseMap<const InputSectionBase *, int> run();

private:
std::vector<Cluster> Clusters;
std::vector<const InputSectionBase *> Sections;

void groupClusters();
};

// Maximum ammount the combined cluster density can be worse than the original
// cluster to consider merging.
constexpr int MAX_DENSITY_DEGRADATION = 8;

// Maximum cluster size in bytes.
constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
} // end anonymous namespace

// Take the edge list in Config->CallGraphProfile, resolve symbol names to
// Symbols, and generate a graph between InputSections with the provided
// weights.
CallGraphSort::CallGraphSort() {
llvm::MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>,
uint64_t> &Profile = Config->CallGraphProfile;
DenseMap<const InputSectionBase *, int> SecToCluster;

auto GetOrCreateNode = [&](const InputSectionBase *IS) -> int {
auto Res = SecToCluster.insert(std::make_pair(IS, Clusters.size()));
if (Res.second) {
Sections.push_back(IS);
Clusters.emplace_back(Clusters.size(), IS->getSize());
}
return Res.first->second;
};

// Create the graph.
for (const auto &C : Profile) {
const auto *FromSB = cast<InputSectionBase>(C.first.first->Repl);
const auto *ToSB = cast<InputSectionBase>(C.first.second->Repl);
uint64_t Weight = C.second;

// Ignore edges between input sections belonging to different output
// sections. This is done because otherwise we would end up with clusters
// containing input sections that can't actually be placed adjacently in the
// output. This messes with the cluster size and density calculations. We
// would also end up moving input sections in other output sections without
// moving them closer to what calls them.
if (FromSB->getOutputSection() != ToSB->getOutputSection())
continue;

int From = GetOrCreateNode(FromSB);
int To = GetOrCreateNode(ToSB);

Clusters[To].Weight += Weight;

if (From == To)
continue;

// Add an edge
Clusters[To].Preds.push_back({From, Weight});
}
for (Cluster &C : Clusters)
C.InitialWeight = C.Weight;
}

// It's bad to merge clusters which would degrade the density too much.
static bool isNewDensityBad(Cluster &A, Cluster &B) {
double NewDensity = double(A.Weight + B.Weight) / double(A.Size + B.Size);
if (NewDensity < A.getDensity() / MAX_DENSITY_DEGRADATION)
return true;
return false;
}

static void mergeClusters(Cluster &Into, Cluster &From) {
Into.Sections.insert(Into.Sections.end(), From.Sections.begin(),
From.Sections.end());
Into.Size += From.Size;
Into.Weight += From.Weight;
From.Sections.clear();
From.Size = 0;
From.Weight = 0;
}

// Group InputSections into clusters using the Call-Chain Clustering heuristic
// then sort the clusters by density.
void CallGraphSort::groupClusters() {
std::vector<int> SortedSecs(Clusters.size());
std::vector<Cluster *> SecToCluster(Clusters.size());

for (int SI = 0, SE = Clusters.size(); SI != SE; ++SI) {
SortedSecs[SI] = SI;
SecToCluster[SI] = &Clusters[SI];
}

std::stable_sort(SortedSecs.begin(), SortedSecs.end(), [&](int A, int B) {
return Clusters[B].getDensity() < Clusters[A].getDensity();
});

for (int SI : SortedSecs) {
// Clusters[SI] is the same as SecToClusters[SI] here because it has not
// been merged into another cluster yet.
Cluster &C = Clusters[SI];

int BestPred = -1;
uint64_t BestWeight = 0;

for (Edge &E : C.Preds) {
if (BestPred == -1 || E.Weight > BestWeight) {
BestPred = E.From;
BestWeight = E.Weight;
}
}

// don't consider merging if the edge is unlikely.
if (BestWeight * 10 <= C.InitialWeight)
continue;

Cluster *PredC = SecToCluster[BestPred];
if (PredC == &C)
continue;

if (C.Size + PredC->Size > MAX_CLUSTER_SIZE)
continue;

if (isNewDensityBad(*PredC, C))
continue;

// NOTE: Consider using a disjoint-set to track section -> cluster mapping
// if this is ever slow.
for (int SI : C.Sections)
SecToCluster[SI] = PredC;

mergeClusters(*PredC, C);
}

// Remove empty or dead nodes. Invalidates all cluster indices.
Clusters.erase(std::remove_if(Clusters.begin(), Clusters.end(),
[](const Cluster &C) {
return C.Size == 0 || C.Sections.empty();
}),
Clusters.end());

// Sort by density.
std::sort(Clusters.begin(), Clusters.end(),
[](const Cluster &A, const Cluster &B) {
return A.getDensity() > B.getDensity();
});
}

DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
groupClusters();

// Generate order.
llvm::DenseMap<const InputSectionBase *, int> OrderMap;
ssize_t CurOrder = 1;

for (const Cluster &C : Clusters)
for (int SecIndex : C.Sections)
OrderMap[Sections[SecIndex]] = CurOrder++;

return OrderMap;
}

// Sort sections by the profile data provided by -callgraph-profile-file
//
// This first builds a call graph based on the profile data then merges sections
// according to the C³ huristic. All clusters are then sorted by a density
// metric to further improve locality.
DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
return CallGraphSort().run();
}
23 changes: 23 additions & 0 deletions lld/ELF/CallGraphSort.h
@@ -0,0 +1,23 @@
//===- CallGraphSort.h ------------------------------------------*- C++ -*-===//
//
// The LLVM Linker
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#ifndef LLD_ELF_CALL_GRAPH_SORT_H
#define LLD_ELF_CALL_GRAPH_SORT_H

#include "llvm/ADT/DenseMap.h"

namespace lld {
namespace elf {
class InputSectionBase;

llvm::DenseMap<const InputSectionBase *, int> computeCallGraphProfileOrder();
} // namespace elf
} // namespace lld

#endif
4 changes: 4 additions & 0 deletions lld/ELF/Config.h
Expand Up @@ -24,6 +24,7 @@ namespace lld {
namespace elf {

class InputFile;
class InputSectionBase;

enum ELFKind {
ELFNoneKind,
Expand Down Expand Up @@ -104,6 +105,9 @@ struct Configuration {
std::vector<SymbolVersion> VersionScriptGlobals;
std::vector<SymbolVersion> VersionScriptLocals;
std::vector<uint8_t> BuildIdVector;
llvm::MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>,
uint64_t>
CallGraphProfile;
bool AllowMultipleDefinition;
bool AndroidPackDynRelocs = false;
bool ARMHasBlx = false;
Expand Down
44 changes: 44 additions & 0 deletions lld/ELF/Driver.cpp
Expand Up @@ -572,6 +572,45 @@ getBuildId(opt::InputArgList &Args) {
return {BuildIdKind::None, {}};
}

static void readCallGraph(MemoryBufferRef MB) {
// Build a map from symbol name to section
DenseMap<StringRef, const Symbol *> SymbolNameToSymbol;
for (InputFile *File : ObjectFiles)
for (Symbol *Sym : File->getSymbols())
SymbolNameToSymbol[Sym->getName()] = Sym;

for (StringRef L : args::getLines(MB)) {
SmallVector<StringRef, 3> Fields;
L.split(Fields, ' ');
if (Fields.size() != 3)
fatal("parse error");
uint64_t Count;
if (!to_integer(Fields[2], Count))
fatal("parse error");
const Symbol *FromSym = SymbolNameToSymbol.lookup(Fields[0]);
const Symbol *ToSym = SymbolNameToSymbol.lookup(Fields[1]);
if (Config->WarnSymbolOrdering) {
if (!FromSym)
warn("call graph file: no such symbol: " + Fields[0]);
if (!ToSym)
warn("call graph file: no such symbol: " + Fields[1]);
}
if (!FromSym || !ToSym || Count == 0)
continue;
warnUnorderableSymbol(FromSym);
warnUnorderableSymbol(ToSym);
const Defined *FromSymD = dyn_cast<Defined>(FromSym);
const Defined *ToSymD = dyn_cast<Defined>(ToSym);
if (!FromSymD || !ToSymD)
continue;
const auto *FromSB = dyn_cast_or_null<InputSectionBase>(FromSymD->Section);
const auto *ToSB = dyn_cast_or_null<InputSectionBase>(ToSymD->Section);
if (!FromSB || !ToSB)
continue;
Config->CallGraphProfile[std::make_pair(FromSB, ToSB)] += Count;
}
}

static bool getCompressDebugSections(opt::InputArgList &Args) {
StringRef S = Args.getLastArgValue(OPT_compress_debug_sections, "none");
if (S == "none")
Expand Down Expand Up @@ -1224,6 +1263,11 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &Args) {
if (Config->ICF)
doIcf<ELFT>();

// Read the callgraph now that we know what was gced or icfed
if (auto *Arg = Args.getLastArg(OPT_call_graph_ordering_file))
if (Optional<MemoryBufferRef> Buffer = readFile(Arg->getValue()))
readCallGraph(*Buffer);

// Write the result to the file.
writeResult<ELFT>();
}
3 changes: 3 additions & 0 deletions lld/ELF/Options.td
Expand Up @@ -66,6 +66,9 @@ defm as_needed: B<"as-needed",
"Only set DT_NEEDED for shared libraries if used",
"Always set DT_NEEDED for shared libraries">;

defm call_graph_ordering_file: Eq<"call-graph-ordering-file">,
HelpText<"Layout sections to optimize the given callgraph">;

// -chroot doesn't have a help text because it is an internal option.
defm chroot: Eq<"chroot">;

Expand Down
21 changes: 21 additions & 0 deletions lld/ELF/Symbols.cpp
Expand Up @@ -248,6 +248,27 @@ void elf::printTraceSymbol(Symbol *Sym) {
message(toString(Sym->File) + S + Sym->getName());
}

void elf::warnUnorderableSymbol(const Symbol *Sym) {
if (!Config->WarnSymbolOrdering)
return;
const InputFile *File = Sym->File;
auto *D = dyn_cast<Defined>(Sym);
if (Sym->isUndefined())
warn(toString(File) +
": unable to order undefined symbol: " + Sym->getName());
else if (Sym->isShared())
warn(toString(File) + ": unable to order shared symbol: " + Sym->getName());
else if (D && !D->Section)
warn(toString(File) +
": unable to order absolute symbol: " + Sym->getName());
else if (D && isa<OutputSection>(D->Section))
warn(toString(File) +
": unable to order synthetic symbol: " + Sym->getName());
else if (D && !D->Section->Repl->Live)
warn(toString(File) +
": unable to order discarded symbol: " + Sym->getName());
}

// Returns a symbol for an error message.
std::string lld::toString(const Symbol &B) {
if (Config->Demangle)
Expand Down

0 comments on commit b842725

Please sign in to comment.