Skip to content

Commit

Permalink
[CSSPGO] Sort function offset table to speed up profile loading.
Browse files Browse the repository at this point in the history
With the context split work, the context-based (an array of strings) sorting performed at profile load time is way more expensive than single-string-based sorting. This is likely due to auxiliary operations done on each array element, such as indirect references and std::min operations, and likely also cache misses. In this change I'm presorting profiles during profile generation time to avoid sorting at compile time.

Compared to the previous context-split work, this effectively cuts down compile time by 20% for one of our large services and brings us closer to a non-CS build, though a small gap in build time still remains.

Reviewed By: wenlei, wmi

Differential Revision: https://reviews.llvm.org/D109036
  • Loading branch information
htyu committed Sep 1, 2021
1 parent 02f74ea commit f4711e0
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 33 deletions.
9 changes: 9 additions & 0 deletions llvm/include/llvm/ProfileData/SampleProf.h
Expand Up @@ -208,6 +208,13 @@ enum class SecFuncMetadataFlags : uint32_t {
SecFlagHasAttribute = (1 << 1)
};

/// Section-specific flags for the function offset table section.
enum class SecFuncOffsetFlags : uint32_t {
  /// No flag set.
  SecFlagInvalid = 0,
  /// Function offsets are stored in context order. This ordering guarantees
  /// that the callee contexts of a given context are laid out right next to
  /// it.
  SecFlagOrdered = 1u << 0,
};

// Verify section specific flag is used for the correct section.
template <class SecFlagType>
static inline void verifySecFlag(SecType Type, SecFlagType Flag) {
Expand All @@ -228,6 +235,8 @@ static inline void verifySecFlag(SecType Type, SecFlagType Flag) {
IsFlagLegal = std::is_same<SecFuncMetadataFlags, SecFlagType>();
break;
default:
case SecFuncOffsetTable:
IsFlagLegal = std::is_same<SecFuncOffsetFlags, SecFlagType>();
break;
}
if (!IsFlagLegal)
Expand Down
7 changes: 7 additions & 0 deletions llvm/include/llvm/ProfileData/SampleProfReader.h
Expand Up @@ -720,6 +720,11 @@ class SampleProfileReaderExtBinaryBase : public SampleProfileReaderBinary {
/// The table mapping from function context to the offset of its
/// FunctionSample towards file start.
DenseMap<SampleContext, uint64_t> FuncOffsetTable;

/// Function offset mapping ordered by contexts.
std::unique_ptr<std::vector<std::pair<SampleContext, uint64_t>>>
OrderedFuncOffsets;

/// The set containing the functions to use when compiling a module.
DenseSet<StringRef> FuncsToUse;

Expand All @@ -746,6 +751,8 @@ class SampleProfileReaderExtBinaryBase : public SampleProfileReaderBinary {
/// SecFlagFlat flag.
bool SkipFlatProf = false;

bool FuncOffsetsOrdered = false;

public:
SampleProfileReaderExtBinaryBase(std::unique_ptr<MemoryBuffer> B,
LLVMContext &C, SampleProfileFormat Format)
Expand Down
76 changes: 46 additions & 30 deletions llvm/lib/ProfileData/SampleProfReader.cpp
Expand Up @@ -675,6 +675,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection(
return EC;
break;
case SecFuncOffsetTable:
FuncOffsetsOrdered = hasSecFlag(Entry, SecFuncOffsetFlags::SecFlagOrdered);
if (std::error_code EC = readFuncOffsetTable())
return EC;
break;
Expand Down Expand Up @@ -720,17 +721,27 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncOffsetTable() {
return EC;

FuncOffsetTable.reserve(*Size);

if (FuncOffsetsOrdered) {
OrderedFuncOffsets =
std::make_unique<std::vector<std::pair<SampleContext, uint64_t>>>();
OrderedFuncOffsets->reserve(*Size);
}

for (uint32_t I = 0; I < *Size; ++I) {
auto FName(readSampleContextFromTable());
if (std::error_code EC = FName.getError())
auto FContext(readSampleContextFromTable());
if (std::error_code EC = FContext.getError())
return EC;

auto Offset = readNumber<uint64_t>();
if (std::error_code EC = Offset.getError())
return EC;

FuncOffsetTable[*FName] = *Offset;
FuncOffsetTable[*FContext] = *Offset;
if (FuncOffsetsOrdered)
OrderedFuncOffsets->emplace_back(*FContext, *Offset);
}

return sampleprof_error::success;
}

Expand Down Expand Up @@ -760,42 +771,43 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncProfiles() {
}

if (ProfileIsCS) {
// Compute the ordered set of names, so we can
// get all context profiles under a subtree by
// iterating through the ordered names.
std::set<SampleContext> OrderedContexts;
for (auto Name : FuncOffsetTable) {
OrderedContexts.insert(Name.first);
}

DenseSet<uint64_t> FuncGuidsToUse;
if (useMD5()) {
for (auto Name : FuncsToUse)
FuncGuidsToUse.insert(Function::getGUID(Name));
}

// For each function in current module, load all
// context profiles for the function.
for (auto NameOffset : FuncOffsetTable) {
SampleContext FContext = NameOffset.first;
auto FuncName = FContext.getName();
if ((useMD5() && !FuncGuidsToUse.count(std::stoull(FuncName.data()))) ||
(!useMD5() && !FuncsToUse.count(FuncName) &&
(!Remapper || !Remapper->exist(FuncName))))
continue;

// For each context profile we need, try to load
// all context profile in the subtree. This can
// help profile guided importing for ThinLTO.
auto It = OrderedContexts.find(FContext);
while (It != OrderedContexts.end() && FContext.IsPrefixOf(*It)) {
const uint8_t *FuncProfileAddr = Start + FuncOffsetTable[*It];
// For each function in current module, load all context profiles for
// the function as well as their callee contexts which can help profile
// guided importing for ThinLTO. This can be achieved by walking
// through an ordered context container, where contexts are laid out
// as if they were walked in preorder of a context trie. While
// traversing the trie, a link to the highest common ancestor node is
// kept so that all of its descendants will be loaded.
assert(OrderedFuncOffsets.get() &&
"func offset table should always be sorted in CS profile");
const SampleContext *CommonContext = nullptr;
for (const auto &NameOffset : *OrderedFuncOffsets) {
const auto &FContext = NameOffset.first;
auto FName = FContext.getName();
// For function in the current module, keep its farthest ancestor
// context. This can be used to load itself and its child and
// sibling contexts.
if ((useMD5() && FuncGuidsToUse.count(std::stoull(FName.data()))) ||
(!useMD5() && (FuncsToUse.count(FName) ||
(Remapper && Remapper->exist(FName))))) {
if (!CommonContext || !CommonContext->IsPrefixOf(FContext))
CommonContext = &FContext;
}

if (CommonContext == &FContext ||
(CommonContext && CommonContext->IsPrefixOf(FContext))) {
// Load profile for the current context which originated from
// the common ancestor.
const uint8_t *FuncProfileAddr = Start + NameOffset.second;
assert(FuncProfileAddr < End && "out of LBRProfile section");
if (std::error_code EC = readFuncProfile(FuncProfileAddr))
return EC;
// Remove loaded context profile so we won't
// load it repeatedly.
It = OrderedContexts.erase(It);
}
}
} else {
Expand Down Expand Up @@ -1212,6 +1224,10 @@ static std::string getSecFlagsStr(const SecHdrTableEntry &Entry) {
if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator))
Flags.append("fs-discriminator,");
break;
case SecFuncOffsetTable:
if (hasSecFlag(Entry, SecFuncOffsetFlags::SecFlagOrdered))
Flags.append("ordered,");
break;
default:
break;
}
Expand Down
26 changes: 23 additions & 3 deletions llvm/lib/ProfileData/SampleProfWriter.cpp
Expand Up @@ -165,11 +165,31 @@ std::error_code SampleProfileWriterExtBinaryBase::writeFuncOffsetTable() {
encodeULEB128(FuncOffsetTable.size(), OS);

// Write out FuncOffsetTable.
for (auto Entry : FuncOffsetTable) {
if (std::error_code EC = writeContextIdx(Entry.first))
auto WriteItem = [&](const SampleContext &Context, uint64_t Offset) {
if (std::error_code EC = writeContextIdx(Context))
return EC;
encodeULEB128(Entry.second, OS);
encodeULEB128(Offset, OS);
return (std::error_code)sampleprof_error::success;
};

if (FunctionSamples::ProfileIsCS) {
// Sort the contexts before writing them out. This is to help fast load all
// context profiles for a function as well as their callee contexts which
// can help profile-guided importing for ThinLTO.
std::map<SampleContext, uint64_t> OrderedFuncOffsetTable(
FuncOffsetTable.begin(), FuncOffsetTable.end());
for (const auto &Entry : OrderedFuncOffsetTable) {
if (std::error_code EC = WriteItem(Entry.first, Entry.second))
return EC;
}
addSectionFlag(SecFuncOffsetTable, SecFuncOffsetFlags::SecFlagOrdered);
} else {
for (const auto &Entry : FuncOffsetTable) {
if (std::error_code EC = WriteItem(Entry.first, Entry.second))
return EC;
}
}

FuncOffsetTable.clear();
return sampleprof_error::success;
}
Expand Down
4 changes: 4 additions & 0 deletions llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
Expand Up @@ -2,8 +2,11 @@
; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list.prof -S | FileCheck %s
; RUN: llvm-profdata merge --sample --extbinary %S/Inputs/csspgo-import-list.prof -o %t.prof
; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%t.prof -S | FileCheck %s
; RUN: llvm-profdata show --sample -show-sec-info-only %t.prof | FileCheck %s --check-prefix=CHECK-ORDERED
; RUN: llvm-profdata merge --sample --extbinary --use-md5 %S/Inputs/csspgo-import-list.prof -o %t.md5
; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%t.md5 -S | FileCheck %s
; RUN: llvm-profdata show --sample -show-sec-info-only %t.md5 | FileCheck %s --check-prefix=CHECK-ORDERED


declare i32 @_Z5funcBi(i32 %x)
declare i32 @_Z5funcAi(i32 %x)
Expand Down Expand Up @@ -32,6 +35,7 @@ for.body: ; preds = %for.body, %entry
; CHECK: distinct !DISubprogram(name: "main"
; CHECK: !{!"function_entry_count", i64 3, i64 446061515086924981, i64 3815895320998406042, i64 7102633082150537521, i64 -2862076748587597320}

; CHECK-ORDERED: FuncOffsetTableSection {{.*}} {ordered}

attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }

Expand Down

0 comments on commit f4711e0

Please sign in to comment.