161 changes: 130 additions & 31 deletions llvm/tools/llvm-profgen/PerfReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,27 @@ void VirtualUnwinder::unwindLinear(UnwindState &State, uint64_t Repeat) {
InstructionPointer &IP = State.InstPtr;
uint64_t Target = State.getCurrentLBRTarget();
uint64_t End = IP.Address;
// Unwind linear execution part
while (IP.Address >= Target) {
uint64_t PrevIP = IP.Address;
IP.backward();
// Break into segments for implicit call/return due to inlining
bool SameInlinee =
State.getBinary()->inlineContextEqual(PrevIP, IP.Address);
if (!SameInlinee || PrevIP == Target) {
recordRangeCount(PrevIP, End, State, Repeat);
End = IP.Address;
if (State.getBinary()->usePseudoProbes()) {
// The outcome of the virtual unwinding with pseudo probes is a
// map from a context key to the address range being unwound.
// This means basically linear unwinding is not needed for pseudo
// probes. The range will be simply recorded here and will be
// converted to a list of pseudo probes to report in ProfileGenerator.
recordRangeCount(Target, End, State, Repeat);
} else {
// Unwind linear execution part
while (IP.Address >= Target) {
uint64_t PrevIP = IP.Address;
IP.backward();
// Break into segments for implicit call/return due to inlining
bool SameInlinee =
State.getBinary()->inlineContextEqual(PrevIP, IP.Address);
if (!SameInlinee || PrevIP == Target) {
recordRangeCount(PrevIP, End, State, Repeat);
End = IP.Address;
}
State.CallStack.front() = IP.Address;
}
State.CallStack.front() = IP.Address;
}
}

Expand All @@ -72,26 +81,76 @@ void VirtualUnwinder::unwindBranchWithinFrame(UnwindState &State) {
State.InstPtr.update(Source);
}

// Look up (or lazily create) the sample counter for the calling context
// described by \p CallStack within \p Binary. Pseudo-probe binaries are
// keyed by a probe-based context; otherwise the expanded context string
// serves as the key.
SampleCounter &
VirtualUnwinder::getOrCreateCounter(const ProfiledBinary *Binary,
                                    std::list<uint64_t> &CallStack) {
  if (Binary->usePseudoProbes())
    return getOrCreateCounterForProbe(Binary, CallStack);

  auto Key = std::make_shared<StringBasedCtxKey>();
  Key->Context = Binary->getExpandedContextStr(CallStack);
  Key->genHashCode();
  // emplace is a no-op when a counter for this context already exists.
  return CtxCounterMap->emplace(Hashable<ContextKey>(Key), SampleCounter())
      .first->second;
}

// Look up (or lazily create) the probe-based context counter for
// \p CallStack. The key is the chain of call probes for the frames below
// the leaf; the leaf frame itself is excluded (see comment below).
SampleCounter &
VirtualUnwinder::getOrCreateCounterForProbe(const ProfiledBinary *Binary,
                                            std::list<uint64_t> &CallStack) {
  std::shared_ptr<ProbeBasedCtxKey> ProbeBasedKey =
      std::make_shared<ProbeBasedCtxKey>();
  if (CallStack.size() > 1) {
    // We don't need to top frame probe since it should be extracted
    // from the range.
    // The top of stack is an instruction from the function where
    // the LBR address range physcially resides. Strip it since
    // the function is not a part of the call context. We also
    // don't need its inline context since the probes being unwound
    // come with an inline context all the way back to the uninlined
    // function in their prefix tree.
    // CallStack is leaf-to-root, so rbegin starts at the root frame;
    // std::prev(rend) stops one short of the leaf at the stack top.
    auto Iter = CallStack.rbegin();
    auto EndT = std::prev(CallStack.rend());
    for (; Iter != EndT; Iter++) {
      uint64_t Address = *Iter;
      const PseudoProbe *CallProbe = Binary->getCallProbeForAddr(Address);
      // We may not find a probe for a merged or external callsite.
      // Callsite merging may cause the loss of original probe IDs.
      // Cutting off the context from here since the inline will
      // not know how to consume a context with unknown callsites.
      if (!CallProbe)
        break;
      ProbeBasedKey->Probes.emplace_back(CallProbe);
    }
  }
  ProbeBasedKey->genHashCode();
  Hashable<ContextKey> ContextId(ProbeBasedKey);
  auto Ret = CtxCounterMap->emplace(ContextId, SampleCounter());
  return Ret.first->second;
}

void VirtualUnwinder::recordRangeCount(uint64_t Start, uint64_t End,
UnwindState &State, uint64_t Repeat) {
std::string &&ContextId = State.getExpandedContextStr();
uint64_t StartOffset = State.getBinary()->virtualAddrToOffset(Start);
uint64_t EndOffset = State.getBinary()->virtualAddrToOffset(End);
SampleCounters->recordRangeCount(ContextId, StartOffset, EndOffset, Repeat);
SampleCounter &SCounter =
getOrCreateCounter(State.getBinary(), State.CallStack);
SCounter.recordRangeCount(StartOffset, EndOffset, Repeat);
}

void VirtualUnwinder::recordBranchCount(const LBREntry &Branch,
UnwindState &State, uint64_t Repeat) {
if (Branch.IsArtificial)
return;
std::string &&ContextId = State.getExpandedContextStr();
uint64_t SourceOffset = State.getBinary()->virtualAddrToOffset(Branch.Source);
uint64_t TargetOffset = State.getBinary()->virtualAddrToOffset(Branch.Target);
SampleCounters->recordBranchCount(ContextId, SourceOffset, TargetOffset,
Repeat);
SampleCounter &SCounter =
getOrCreateCounter(State.getBinary(), State.CallStack);
SCounter.recordBranchCount(SourceOffset, TargetOffset, Repeat);
}

bool VirtualUnwinder::unwind(const HybridSample &Sample, uint64_t Repeat) {
bool VirtualUnwinder::unwind(const HybridSample *Sample, uint64_t Repeat) {
// Capture initial state as starting point for unwinding.
UnwindState State(Sample);

Expand Down Expand Up @@ -198,10 +257,10 @@ ProfiledBinary *PerfReader::getBinary(uint64_t Address) {
return Iter->second;
}

static void printSampleCounter(ContextRangeCounter &Counter) {
// Use ordered map to make the output deterministic
std::map<std::string, RangeSample> OrderedCounter(Counter.begin(),
Counter.end());
// Use ordered map to make the output deterministic
using OrderedCounterForPrint = std::map<std::string, RangeSample>;

static void printSampleCounter(OrderedCounterForPrint &OrderedCounter) {
for (auto Range : OrderedCounter) {
outs() << Range.first << "\n";
for (auto I : Range.second) {
Expand All @@ -211,20 +270,59 @@ static void printSampleCounter(ContextRangeCounter &Counter) {
}
}

// Render a context key as a human-readable context string.
// String-based keys already carry their text; probe-based keys are
// expanded frame-by-frame via \p Binary and joined with " @ ".
static std::string getContextKeyStr(ContextKey *K,
                                    const ProfiledBinary *Binary) {
  if (const auto *StrKey = dyn_cast<StringBasedCtxKey>(K))
    return StrKey->Context;

  std::string Result;
  if (const auto *ProbeKey = dyn_cast<ProbeBasedCtxKey>(K)) {
    SmallVector<std::string, 16> Frames;
    for (const auto *Probe : ProbeKey->Probes)
      Binary->getInlineContextForProbe(Probe, Frames, true);
    bool First = true;
    for (const auto &Frame : Frames) {
      if (!First)
        Result += " @ ";
      Result += Frame;
      First = false;
    }
  }
  return Result;
}

// Print the per-context range counters of \p Counter in deterministic
// (string-sorted) order by first bucketing them into an ordered map.
static void printRangeCounter(ContextSampleCounterMap &Counter,
                              const ProfiledBinary *Binary) {
  OrderedCounterForPrint Sorted;
  for (auto &Entry : Counter)
    Sorted[getContextKeyStr(Entry.first.getPtr(), Binary)] =
        Entry.second.RangeCounter;
  printSampleCounter(Sorted);
}

// Print the per-context branch counters of \p Counter in deterministic
// (string-sorted) order by first bucketing them into an ordered map.
static void printBranchCounter(ContextSampleCounterMap &Counter,
                               const ProfiledBinary *Binary) {
  OrderedCounterForPrint Sorted;
  for (auto &Entry : Counter)
    Sorted[getContextKeyStr(Entry.first.getPtr(), Binary)] =
        Entry.second.BranchCounter;
  printSampleCounter(Sorted);
}

void PerfReader::printUnwinderOutput() {
for (auto I : BinarySampleCounters) {
const ProfiledBinary *Binary = I.first;
outs() << "Binary(" << Binary->getName().str() << ")'s Range Counter:\n";
printSampleCounter(I.second.RangeCounter);
printRangeCounter(I.second, Binary);
outs() << "\nBinary(" << Binary->getName().str() << ")'s Branch Counter:\n";
printSampleCounter(I.second.BranchCounter);
printBranchCounter(I.second, Binary);
}
}

void PerfReader::unwindSamples() {
for (const auto &Item : AggregatedSamples) {
const HybridSample &Sample = Item.first;
VirtualUnwinder Unwinder(&BinarySampleCounters[Sample.Binary]);
const HybridSample *Sample = dyn_cast<HybridSample>(Item.first.getPtr());
VirtualUnwinder Unwinder(&BinarySampleCounters[Sample->Binary]);
Unwinder.unwind(Sample, Item.second);
}

Expand Down Expand Up @@ -366,26 +464,27 @@ void PerfReader::parseHybridSample(TraceStream &TraceIt) {
// 0x4005c8/0x4005dc/P/-/-/0 0x40062f/0x4005b0/P/-/-/0 ...
// ... 0x4005c8/0x4005dc/P/-/-/0 # LBR Entries
//
HybridSample Sample;
std::shared_ptr<HybridSample> Sample = std::make_shared<HybridSample>();

// Parsing call stack and populate into HybridSample.CallStack
if (!extractCallstack(TraceIt, Sample.CallStack)) {
if (!extractCallstack(TraceIt, Sample->CallStack)) {
// Skip the next LBR line matched current call stack
if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x"))
TraceIt.advance();
return;
}
// Set the binary current sample belongs to
Sample.Binary = getBinary(Sample.CallStack.front());
Sample->Binary = getBinary(Sample->CallStack.front());

if (!TraceIt.isAtEoF() && TraceIt.getCurrentLine().startswith(" 0x")) {
// Parsing LBR stack and populate into HybridSample.LBRStack
if (extractLBRStack(TraceIt, Sample.LBRStack, Sample.Binary)) {
if (extractLBRStack(TraceIt, Sample->LBRStack, Sample->Binary)) {
// Canonicalize stack leaf to avoid 'random' IP from leaf frame skew LBR
// ranges
Sample.CallStack.front() = Sample.LBRStack[0].Target;
Sample->CallStack.front() = Sample->LBRStack[0].Target;
// Record samples by aggregation
AggregatedSamples[Sample]++;
Sample->genHashCode();
AggregatedSamples[Hashable<PerfSample>(Sample)]++;
}
} else {
// LBR sample is encoded in single line after stack sample
Expand Down
240 changes: 187 additions & 53 deletions llvm/tools/llvm-profgen/PerfReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#define LLVM_TOOLS_LLVM_PROFGEN_PERFREADER_H
#include "ErrorHandling.h"
#include "ProfiledBinary.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Regex.h"
#include <fstream>
Expand Down Expand Up @@ -75,21 +76,79 @@ struct LBREntry {
: Source(S), Target(T), IsArtificial(I) {}
};

// Hash interface for generic data of type T
// Data should implement a \fn getHashCode and a \fn isEqual
// Currently getHashCode is non-virtual to avoid the overhead of calling vtable,
// i.e we explicitly calculate hash of derived class, assign to base class's
// HashCode. This also provides the flexibility for calculating the hash code
// incrementally(like rolling hash) during frame stack unwinding since unwinding
// only changes the leaf of frame stack. \fn isEqual is a virtual function,
// which will have perf overhead. In the future, if we redesign a better hash
// function, then we can just skip this or switch to non-virtual function(like
// just ignore comparision if hash conflicts probabilities is low)
template <class T> class Hashable {
public:
  // Shared ownership of the underlying key/sample data.
  std::shared_ptr<T> Data;
  Hashable(const std::shared_ptr<T> &D) : Data(D) {}

  // Hash code generation
  struct Hash {
    uint64_t operator()(const Hashable<T> &Key) const {
      // Don't make it virtual for getHashCode
      // A zero hash means the data's genHashCode() was never called.
      assert(Key.Data->getHashCode() && "Should generate HashCode for it!");
      return Key.Data->getHashCode();
    }
  };

  // Hash equal
  struct Equal {
    bool operator()(const Hashable<T> &LHS, const Hashable<T> &RHS) const {
      // Precisely compare the data, vtable will have overhead.
      return LHS.Data->isEqual(RHS.Data.get());
    }
  };

  // Raw, non-owning access to the wrapped data.
  T *getPtr() const { return Data.get(); }
};

// Base class to extend for all types of perf sample
struct PerfSample {
  // Cached hash, filled in by the derived class's genHashCode();
  // 0 means the hash has not been generated yet.
  uint64_t HashCode = 0;

  virtual ~PerfSample() = default;
  uint64_t getHashCode() const { return HashCode; }
  // Base equality only compares cached hashes; derived classes override
  // this with a precise field-by-field comparison.
  virtual bool isEqual(const PerfSample *K) const {
    return HashCode == K->HashCode;
  };

  // Utilities for LLVM-style RTTI
  enum PerfKind { PK_HybridSample };
  const PerfKind Kind;
  PerfKind getKind() const { return Kind; }
  PerfSample(PerfKind K) : Kind(K){};
};

// The parsed hybrid sample including call stack and LBR stack.
struct HybridSample {
struct HybridSample : public PerfSample {
// Profiled binary that current frame address belongs to
ProfiledBinary *Binary;
// Call stack recorded in FILO(leaf to root) order
std::list<uint64_t> CallStack;
// LBR stack recorded in FIFO order
SmallVector<LBREntry, 16> LBRStack;

HybridSample() : PerfSample(PK_HybridSample){};
static bool classof(const PerfSample *K) {
return K->getKind() == PK_HybridSample;
}

// Used for sample aggregation
bool operator==(const HybridSample &Other) const {
if (Other.Binary != Binary)
bool isEqual(const PerfSample *K) const override {
const HybridSample *Other = dyn_cast<HybridSample>(K);
if (Other->Binary != Binary)
return false;
const std::list<uint64_t> &OtherCallStack = Other.CallStack;
const SmallVector<LBREntry, 16> &OtherLBRStack = Other.LBRStack;
const std::list<uint64_t> &OtherCallStack = Other->CallStack;
const SmallVector<LBREntry, 16> &OtherLBRStack = Other->LBRStack;

if (CallStack.size() != OtherCallStack.size() ||
LBRStack.size() != OtherLBRStack.size())
Expand All @@ -108,8 +167,32 @@ struct HybridSample {
}
return true;
}

  // Compute and cache the sample's hash over the owning binary, the call
  // stack and the LBR entries, so equal samples land in the same bucket
  // of the aggregation counter.
  void genHashCode() {
    // Use simple DJB2 hash
    auto HashCombine = [](uint64_t H, uint64_t V) {
      return ((H << 5) + H) + V;
    };
    uint64_t Hash = 5381;
    Hash = HashCombine(Hash, reinterpret_cast<uint64_t>(Binary));
    for (const auto &Value : CallStack) {
      Hash = HashCombine(Hash, Value);
    }
    for (const auto &Entry : LBRStack) {
      Hash = HashCombine(Hash, Entry.Source);
      Hash = HashCombine(Hash, Entry.Target);
    }
    HashCode = Hash;
  }
};

// After parsing the sample, we record the samples by aggregating them
// into this counter. The key stores the sample data and the value is
// the sample repeat times.
using AggregatedCounter =
std::unordered_map<Hashable<PerfSample>, uint64_t,
Hashable<PerfSample>::Hash, Hashable<PerfSample>::Equal>;

// The state for the unwinder, it doesn't hold the data but only keep the
// pointer/index of the data, While unwinding, the CallStack is changed
// dynamicially and will be recorded as the context of the sample
Expand All @@ -124,10 +207,10 @@ struct UnwindState {
const SmallVector<LBREntry, 16> &LBRStack;
// Used to iterate the address range
InstructionPointer InstPtr;
UnwindState(const HybridSample &Sample)
: Binary(Sample.Binary), CallStack(Sample.CallStack),
LBRStack(Sample.LBRStack),
InstPtr(Sample.Binary, Sample.CallStack.front()) {}
UnwindState(const HybridSample *Sample)
: Binary(Sample->Binary), CallStack(Sample->CallStack),
LBRStack(Sample->LBRStack),
InstPtr(Sample->Binary, Sample->CallStack.front()) {}

bool validateInitialState() {
uint64_t LBRLeaf = LBRStack[LBRIndex].Target;
Expand Down Expand Up @@ -160,56 +243,92 @@ struct UnwindState {
void advanceLBR() { LBRIndex++; }
};

// The counter of branch samples for one function indexed by the branch,
// which is represented as the source and target offset pair.
using BranchSample = std::map<std::pair<uint64_t, uint64_t>, uint64_t>;
// The counter of range samples for one function indexed by the range,
// which is represented as the start and end offset pair.
using RangeSample = std::map<std::pair<uint64_t, uint64_t>, uint64_t>;
// Range sample counters indexed by the context string
using ContextRangeCounter = std::unordered_map<std::string, RangeSample>;
// Branch sample counters indexed by the context string
using ContextBranchCounter = std::unordered_map<std::string, BranchSample>;

// For Hybrid sample counters
struct ContextSampleCounters {
ContextRangeCounter RangeCounter;
ContextBranchCounter BranchCounter;

void recordRangeCount(std::string &ContextId, uint64_t Start, uint64_t End,
uint64_t Repeat) {
RangeCounter[ContextId][{Start, End}] += Repeat;
// Base class for sample counter key with context
struct ContextKey {
  // Cached hash, filled in by the derived class's genHashCode();
  // 0 means the hash has not been generated yet.
  uint64_t HashCode = 0;
  virtual ~ContextKey() = default;
  uint64_t getHashCode() const { return HashCode; }
  // Base equality only compares cached hashes; derived classes override
  // this with a precise comparison of the underlying context.
  virtual bool isEqual(const ContextKey *K) const {
    return HashCode == K->HashCode;
  };

  // Utilities for LLVM-style RTTI
  enum ContextKind { CK_StringBased, CK_ProbeBased };
  const ContextKind Kind;
  ContextKind getKind() const { return Kind; }
  ContextKey(ContextKind K) : Kind(K){};
};

// String based context id
struct StringBasedCtxKey : public ContextKey {
std::string Context;
StringBasedCtxKey() : ContextKey(CK_StringBased){};
static bool classof(const ContextKey *K) {
return K->getKind() == CK_StringBased;
}
void recordBranchCount(std::string &ContextId, uint64_t Source,
uint64_t Target, uint64_t Repeat) {
BranchCounter[ContextId][{Source, Target}] += Repeat;

bool isEqual(const ContextKey *K) const override {
const StringBasedCtxKey *Other = dyn_cast<StringBasedCtxKey>(K);
return Context == Other->Context;
}

void genHashCode() { HashCode = hash_value(Context); }
};

struct HybridSampleHash {
uint64_t hashCombine(uint64_t Hash, uint64_t Value) const {
// Simple DJB2 hash
return ((Hash << 5) + Hash) + Value;
// Probe based context key as the intermediate key of context
// String based context key will introduce redundant string handling
// since the callee context is inferred from the context string which
// need to be splitted by '@' to get the last location frame, so we
// can just use probe instead and generate the string in the end.
struct ProbeBasedCtxKey : public ContextKey {
SmallVector<const PseudoProbe *, 16> Probes;

ProbeBasedCtxKey() : ContextKey(CK_ProbeBased) {}
static bool classof(const ContextKey *K) {
return K->getKind() == CK_ProbeBased;
}

uint64_t operator()(const HybridSample &Sample) const {
uint64_t Hash = 5381;
Hash = hashCombine(Hash, reinterpret_cast<uint64_t>(Sample.Binary));
for (const auto &Value : Sample.CallStack) {
Hash = hashCombine(Hash, Value);
bool isEqual(const ContextKey *K) const override {
const ProbeBasedCtxKey *O = dyn_cast<ProbeBasedCtxKey>(K);
assert(O != nullptr && "Probe based key shouldn't be null in isEqual");
return std::equal(Probes.begin(), Probes.end(), O->Probes.begin(),
O->Probes.end());
}

void genHashCode() {
for (const auto *P : Probes) {
HashCode = hash_combine(HashCode, P);
}
for (const auto &Entry : Sample.LBRStack) {
Hash = hashCombine(Hash, Entry.Source);
Hash = hashCombine(Hash, Entry.Target);
if (HashCode == 0) {
// Avoid zero value of HashCode when it's an empty list
HashCode = 1;
}
return Hash;
}
};

// After parsing the sample, we record the samples by aggregating them
// into this structure and the value is the sample counter.
using AggregationCounter =
std::unordered_map<HybridSample, uint64_t, HybridSampleHash>;
// The counter of branch samples for one function indexed by the branch,
// which is represented as the source and target offset pair.
using BranchSample = std::map<std::pair<uint64_t, uint64_t>, uint64_t>;
// The counter of range samples for one function indexed by the range,
// which is represented as the start and end offset pair.
using RangeSample = std::map<std::pair<uint64_t, uint64_t>, uint64_t>;
// Wrapper for sample counters including range counter and branch counter
struct SampleCounter {
  // Execution counts keyed by (start offset, end offset) address range.
  RangeSample RangeCounter;
  // Taken counts keyed by (source offset, target offset) branch pair.
  BranchSample BranchCounter;

  // Accumulate \p Repeat executions of the range [Start, End].
  void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Repeat) {
    RangeCounter[{Start, End}] += Repeat;
  }
  // Accumulate \p Repeat takes of the branch Source -> Target.
  void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Repeat) {
    BranchCounter[{Source, Target}] += Repeat;
  }
};

// Sample counter with context to support context-sensitive profile
using ContextSampleCounterMap =
std::unordered_map<Hashable<ContextKey>, SampleCounter,
Hashable<ContextKey>::Hash, Hashable<ContextKey>::Equal>;

/*
As in hybrid sample we have a group of LBRs and the most recent sampling call
Expand All @@ -232,7 +351,7 @@ range as sample counter for further CS profile generation.
*/
class VirtualUnwinder {
public:
VirtualUnwinder(ContextSampleCounters *Counters) : SampleCounters(Counters) {}
VirtualUnwinder(ContextSampleCounterMap *Counter) : CtxCounterMap(Counter) {}

bool isCallState(UnwindState &State) const {
// The tail call frame is always missing here in stack sample, we will
Expand All @@ -250,14 +369,29 @@ class VirtualUnwinder {
void unwindLinear(UnwindState &State, uint64_t Repeat);
void unwindReturn(UnwindState &State);
void unwindBranchWithinFrame(UnwindState &State);
bool unwind(const HybridSample &Sample, uint64_t Repeat);
bool unwind(const HybridSample *Sample, uint64_t Repeat);
void recordRangeCount(uint64_t Start, uint64_t End, UnwindState &State,
uint64_t Repeat);
void recordBranchCount(const LBREntry &Branch, UnwindState &State,
uint64_t Repeat);
SampleCounter &getOrCreateCounter(const ProfiledBinary *Binary,
std::list<uint64_t> &CallStack);
// Use pseudo probe based context key to get the sample counter
// A context stands for a call path from 'main' to an uninlined
// callee with all inline frames recovered on that path. The probes
// belonging to that call path is the probes either originated from
// the callee or from any functions inlined into the callee. Since
// pseudo probes are organized in a tri-tree style after decoded,
// the tree path from the tri-tree root (which is the uninlined
// callee) to the probe node forms an inline context.
// Here we use a list of probe(pointer) as the context key to speed up
// aggregation and the final context string will be generate in
// ProfileGenerator
SampleCounter &getOrCreateCounterForProbe(const ProfiledBinary *Binary,
std::list<uint64_t> &CallStack);

private:
ContextSampleCounters *SampleCounters;
ContextSampleCounterMap *CtxCounterMap;
};

// Filename to binary map
Expand All @@ -268,7 +402,7 @@ using AddressBinaryMap = std::map<uint64_t, ProfiledBinary *>;
// same binary loaded at different addresses, they should share the same sample
// counter
using BinarySampleCounterMap =
std::unordered_map<ProfiledBinary *, ContextSampleCounters>;
std::unordered_map<ProfiledBinary *, ContextSampleCounterMap>;

// Load binaries and read perf trace to parse the events and samples
class PerfReader {
Expand Down Expand Up @@ -344,7 +478,7 @@ class PerfReader {
private:
BinarySampleCounterMap BinarySampleCounters;
// Samples with the repeating time generated by the perf reader
AggregationCounter AggregatedSamples;
AggregatedCounter AggregatedSamples;
PerfScriptType PerfType;
};

Expand Down
163 changes: 77 additions & 86 deletions llvm/tools/llvm-profgen/ProfileGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,23 @@ using namespace sampleprof;
namespace llvm {
namespace sampleprof {

static bool
usePseudoProbes(const BinarySampleCounterMap &BinarySampleCounters) {
return BinarySampleCounters.size() &&
BinarySampleCounters.begin()->first->usePseudoProbes();
}

std::unique_ptr<ProfileGenerator>
ProfileGenerator::create(const BinarySampleCounterMap &BinarySampleCounters,
enum PerfScriptType SampleType) {
std::unique_ptr<ProfileGenerator> ProfileGenerator;

if (SampleType == PERF_LBR_STACK) {
ProfileGenerator.reset(new CSProfileGenerator(BinarySampleCounters));
if (usePseudoProbes(BinarySampleCounters)) {
ProfileGenerator.reset(
new PseudoProbeCSProfileGenerator(BinarySampleCounters));
} else {
ProfileGenerator.reset(new CSProfileGenerator(BinarySampleCounters));
}
} else {
// TODO:
llvm_unreachable("Unsupported perfscript!");
Expand Down Expand Up @@ -178,95 +188,76 @@ void CSProfileGenerator::updateBodySamplesforFunctionProfile(
}
}

void CSProfileGenerator::populateFunctionBodySamples() {
for (const auto &BI : BinarySampleCounters) {
ProfiledBinary *Binary = BI.first;
for (const auto &CI : BI.second.RangeCounter) {
StringRef ContextId(CI.first);
// Get or create function profile for the range
FunctionSamples &FunctionProfile =
getFunctionProfileForContext(ContextId);
// Compute disjoint ranges first, so we can use MAX
// for calculating count for each location.
RangeSample Ranges;
findDisjointRanges(Ranges, CI.second);

for (auto Range : Ranges) {
uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first);
uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second);
uint64_t Count = Range.second;
// Disjoint ranges have introduce zero-filled gap that
// doesn't belong to current context, filter them out.
if (Count == 0)
continue;

InstructionPointer IP(Binary, RangeBegin, true);

// Disjoint ranges may have range in the middle of two instr,
// e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
// can be Addr1+1 to Addr2-1. We should ignore such range.
if (IP.Address > RangeEnd)
continue;

while (IP.Address <= RangeEnd) {
uint64_t Offset = Binary->virtualAddrToOffset(IP.Address);
const FrameLocation &LeafLoc = Binary->getInlineLeafFrameLoc(Offset);
// Recording body sample for this specific context
updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc, Count);
// Move to next IP within the range
IP.advance();
}
}
// Record body samples into \p FunctionProfile by walking every
// instruction covered by the (context-specific) address ranges in
// \p RangeCounter, resolving each instruction through \p Binary.
void CSProfileGenerator::populateFunctionBodySamples(
    FunctionSamples &FunctionProfile, const RangeSample &RangeCounter,
    ProfiledBinary *Binary) {
  // Compute disjoint ranges first, so we can use MAX
  // for calculating count for each location.
  RangeSample Ranges;
  findDisjointRanges(Ranges, RangeCounter);
  for (auto Range : Ranges) {
    uint64_t RangeBegin = Binary->offsetToVirtualAddr(Range.first.first);
    uint64_t RangeEnd = Binary->offsetToVirtualAddr(Range.first.second);
    uint64_t Count = Range.second;
    // Disjoint ranges have introduce zero-filled gap that
    // doesn't belong to current context, filter them out.
    if (Count == 0)
      continue;

    InstructionPointer IP(Binary, RangeBegin, true);

    // Disjoint ranges may have range in the middle of two instr,
    // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
    // can be Addr1+1 to Addr2-1. We should ignore such range.
    if (IP.Address > RangeEnd)
      continue;

    while (IP.Address <= RangeEnd) {
      uint64_t Offset = Binary->virtualAddrToOffset(IP.Address);
      const FrameLocation &LeafLoc = Binary->getInlineLeafFrameLoc(Offset);
      // Recording body sample for this specific context
      updateBodySamplesforFunctionProfile(FunctionProfile, LeafLoc, Count);
      // Move to next IP within the range
      IP.advance();
    }
  }
}

void CSProfileGenerator::populateFunctionBoundarySamples() {
for (const auto &BI : BinarySampleCounters) {
ProfiledBinary *Binary = BI.first;
for (const auto &CI : BI.second.BranchCounter) {
StringRef ContextId(CI.first);
// Get or create function profile for branch Source
FunctionSamples &FunctionProfile =
getFunctionProfileForContext(ContextId);

for (auto Entry : CI.second) {
uint64_t SourceOffset = Entry.first.first;
uint64_t TargetOffset = Entry.first.second;
uint64_t Count = Entry.second;
// Get the callee name by branch target if it's a call branch
StringRef CalleeName = FunctionSamples::getCanonicalFnName(
Binary->getFuncFromStartOffset(TargetOffset));
if (CalleeName.size() == 0)
continue;

// Record called target sample and its count
const FrameLocation &LeafLoc =
Binary->getInlineLeafFrameLoc(SourceOffset);

FunctionProfile.addCalledTargetSamples(LeafLoc.second.LineOffset,
LeafLoc.second.Discriminator,
CalleeName, Count);
FunctionProfile.addTotalSamples(Count);

// Record head sample for called target(callee)
// TODO: Cleanup ' @ '
std::string CalleeContextId =
getCallSite(LeafLoc) + " @ " + CalleeName.str();
if (ContextId.find(" @ ") != StringRef::npos) {
CalleeContextId =
ContextId.rsplit(" @ ").first.str() + " @ " + CalleeContextId;
}

if (ProfileMap.find(CalleeContextId) != ProfileMap.end()) {
FunctionSamples &CalleeProfile = ProfileMap[CalleeContextId];
assert(Count != 0 && "Unexpected zero weight branch");
if (CalleeProfile.getName().size()) {
CalleeProfile.addHeadSamples(Count);
}
}
}
// Record boundary (branch) samples for \p FunctionProfile: for every call
// branch in \p BranchCounters, add a called-target sample at the call
// site and head samples to the callee's context profile, where the callee
// context is derived from \p ContextId.
void CSProfileGenerator::populateFunctionBoundarySamples(
    StringRef ContextId, FunctionSamples &FunctionProfile,
    const BranchSample &BranchCounters, ProfiledBinary *Binary) {

  for (auto Entry : BranchCounters) {
    uint64_t SourceOffset = Entry.first.first;
    uint64_t TargetOffset = Entry.first.second;
    uint64_t Count = Entry.second;
    // Get the callee name by branch target if it's a call branch
    StringRef CalleeName = FunctionSamples::getCanonicalFnName(
        Binary->getFuncFromStartOffset(TargetOffset));
    if (CalleeName.size() == 0)
      continue;

    // Record called target sample and its count
    const FrameLocation &LeafLoc = Binary->getInlineLeafFrameLoc(SourceOffset);

    FunctionProfile.addCalledTargetSamples(LeafLoc.second.LineOffset,
                                           LeafLoc.second.Discriminator,
                                           CalleeName, Count);
    FunctionProfile.addTotalSamples(Count);

    // Record head sample for called target(callee)
    // TODO: Cleanup ' @ '
    // Callee context = caller context (minus its leaf frame location,
    // replaced by this call site) + callee name.
    std::string CalleeContextId =
        getCallSite(LeafLoc) + " @ " + CalleeName.str();
    if (ContextId.find(" @ ") != StringRef::npos) {
      CalleeContextId =
          ContextId.rsplit(" @ ").first.str() + " @ " + CalleeContextId;
    }

    FunctionSamples &CalleeProfile =
        getFunctionProfileForContext(CalleeContextId);
    assert(Count != 0 && "Unexpected zero weight branch");
    CalleeProfile.addHeadSamples(Count);
  }
}

Expand Down
44 changes: 36 additions & 8 deletions llvm/tools/llvm-profgen/ProfileGenerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class ProfileGenerator {
};

class CSProfileGenerator : public ProfileGenerator {
protected:
const BinarySampleCounterMap &BinarySampleCounters;

public:
Expand All @@ -64,12 +65,24 @@ class CSProfileGenerator : public ProfileGenerator {

public:
void generateProfile() override {
// Fill in function body samples
populateFunctionBodySamples();

// Fill in boundary sample counts as well as call site samples for calls
populateFunctionBoundarySamples();

for (const auto &BI : BinarySampleCounters) {
ProfiledBinary *Binary = BI.first;
for (const auto &CI : BI.second) {
const StringBasedCtxKey *CtxKey =
dyn_cast<StringBasedCtxKey>(CI.first.getPtr());
StringRef ContextId(CtxKey->Context);
// Get or create function profile for the range
FunctionSamples &FunctionProfile =
getFunctionProfileForContext(ContextId);

// Fill in function body samples
populateFunctionBodySamples(FunctionProfile, CI.second.RangeCounter,
Binary);
// Fill in boundary sample counts as well as call site samples for calls
populateFunctionBoundarySamples(ContextId, FunctionProfile,
CI.second.BranchCounter, Binary);
}
}
// Fill in call site value sample for inlined calls and also use context to
// infer missing samples. Since we don't have call count for inlined
// functions, we estimate it from inlinee's profile using the entry of the
Expand All @@ -85,11 +98,26 @@ class CSProfileGenerator : public ProfileGenerator {
uint64_t Count);
// Lookup or create FunctionSamples for the context
FunctionSamples &getFunctionProfileForContext(StringRef ContextId);
void populateFunctionBodySamples();
void populateFunctionBoundarySamples();
void populateFunctionBodySamples(FunctionSamples &FunctionProfile,
const RangeSample &RangeCounters,
ProfiledBinary *Binary);
void populateFunctionBoundarySamples(StringRef ContextId,
FunctionSamples &FunctionProfile,
const BranchSample &BranchCounters,
ProfiledBinary *Binary);
void populateInferredFunctionSamples();
};

// Context-sensitive profile generator for pseudo-probe based counters.
// Profile generation from probe-based contexts is not implemented yet.
class PseudoProbeCSProfileGenerator : public CSProfileGenerator {

public:
  PseudoProbeCSProfileGenerator(const BinarySampleCounterMap &Counters)
      : CSProfileGenerator(Counters) {}
  void generateProfile() override {
    // TODO
  }
};

} // end namespace sampleprof
} // end namespace llvm

Expand Down
36 changes: 35 additions & 1 deletion llvm/tools/llvm-profgen/ProfiledBinary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ static cl::opt<bool> ShowSourceLocations("show-source-locations",
cl::ZeroOrMore,
cl::desc("Print source locations."));

static cl::opt<bool> ShowPseudoProbe(
"show-pseudo-probe", cl::ReallyHidden, cl::init(false), cl::ZeroOrMore,
cl::desc("Print pseudo probe section and disassembled info."));

namespace llvm {
namespace sampleprof {

Expand Down Expand Up @@ -93,6 +97,9 @@ void ProfiledBinary::load() {
// Find the preferred base address for text sections.
setPreferredBaseAddress(Obj);

// Decode pseudo probe related section
decodePseudoProbe(Obj);

// Disassemble the text sections.
disassemble(Obj);

Expand Down Expand Up @@ -149,7 +156,6 @@ ProfiledBinary::getExpandedContextStr(const std::list<uint64_t> &Stack) const {
OContextStr << ContextVec[I];
}
}

return OContextStr.str();
}

Expand All @@ -165,6 +171,30 @@ void ProfiledBinary::setPreferredBaseAddress(const ELFObjectFileBase *Obj) {
exitWithError("no text section found", Obj->getFileName());
}

void ProfiledBinary::decodePseudoProbe(const ELFObjectFileBase *Obj) {
StringRef FileName = Obj->getFileName();
for (section_iterator SI = Obj->section_begin(), SE = Obj->section_end();
SI != SE; ++SI) {
const SectionRef &Section = *SI;
StringRef SectionName = unwrapOrError(Section.getName(), FileName);

if (SectionName == ".pseudo_probe_desc") {
StringRef Contents = unwrapOrError(Section.getContents(), FileName);
ProbeDecoder.buildGUID2FuncDescMap(
reinterpret_cast<const uint8_t *>(Contents.data()), Contents.size());
} else if (SectionName == ".pseudo_probe") {
StringRef Contents = unwrapOrError(Section.getContents(), FileName);
ProbeDecoder.buildAddress2ProbeMap(
reinterpret_cast<const uint8_t *>(Contents.data()), Contents.size());
// set UsePseudoProbes flag, used for PerfReader
UsePseudoProbes = true;
}
}

if (ShowPseudoProbe)
ProbeDecoder.printGUID2FuncDescMap(outs());
}

bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
SectionSymbolsTy &Symbols,
const SectionRef &Section) {
Expand Down Expand Up @@ -193,6 +223,10 @@ bool ProfiledBinary::dissassembleSymbol(std::size_t SI, ArrayRef<uint8_t> Bytes,
return false;

if (ShowDisassembly) {
if (ShowPseudoProbe) {
ProbeDecoder.printProbeForAddress(outs(),
Offset + PreferredBaseAddress);
}
outs() << format("%8" PRIx64 ":", Offset);
size_t Start = outs().tell();
IPrinter->printInst(&Inst, Offset + Size, "", *STI.get(), outs());
Expand Down
21 changes: 21 additions & 0 deletions llvm/tools/llvm-profgen/ProfiledBinary.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#define LLVM_TOOLS_LLVM_PROFGEN_PROFILEDBINARY_H

#include "CallContext.h"
#include "PseudoProbe.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/Symbolize/Symbolize.h"
#include "llvm/MC/MCAsmInfo.h"
Expand Down Expand Up @@ -128,8 +129,16 @@ class ProfiledBinary {

// The symbolizer used to get inline context for an instruction.
std::unique_ptr<symbolize::LLVMSymbolizer> Symbolizer;

// Pseudo probe decoder
PseudoProbeDecoder ProbeDecoder;

bool UsePseudoProbes = false;

void setPreferredBaseAddress(const ELFObjectFileBase *O);

void decodePseudoProbe(const ELFObjectFileBase *Obj);

// Set up disassembler and related components.
void setUpDisassembler(const ELFObjectFileBase *Obj);
void setupSymbolizer();
Expand Down Expand Up @@ -197,6 +206,7 @@ class ProfiledBinary {
return offsetToVirtualAddr(CodeAddrs[Index]);
}

  // Whether a .pseudo_probe section was found and decoded for this binary.
  bool usePseudoProbes() const { return UsePseudoProbes; }
// Get the index in CodeAddrs for the address
// As we might get an address which is not the code
// here it would round to the next valid code address by
Expand Down Expand Up @@ -227,6 +237,17 @@ class ProfiledBinary {
// It will search the disassembling info stored in Offset2LocStackMap. This is
// used as the key of function sample map
std::string getExpandedContextStr(const std::list<uint64_t> &stack) const;

  // Return the call probe attached to Address, or nullptr if the address is
  // not a probed callsite. Forwards to the decoder.
  const PseudoProbe *getCallProbeForAddr(uint64_t Address) const {
    return ProbeDecoder.getCallProbeForAddr(Address);
  }
void
getInlineContextForProbe(const PseudoProbe *Probe,
SmallVector<std::string, 16> &InlineContextStack,
bool IncludeLeaf) const {
return ProbeDecoder.getInlineContextForProbe(Probe, InlineContextStack,
IncludeLeaf);
}
};

} // end namespace sampleprof
Expand Down
334 changes: 334 additions & 0 deletions llvm/tools/llvm-profgen/PseudoProbe.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,334 @@
//===--- PseudoProbe.cpp - Pseudo probe decoding utilities ------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "PseudoProbe.h"
#include "ErrorHandling.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"
#include <limits>
#include <memory>

using namespace llvm;
using namespace sampleprof;
using namespace support;

namespace llvm {
namespace sampleprof {

// Resolve a function GUID to the name recorded in the .pseudo_probe_desc
// section. The GUID must have a descriptor entry.
static StringRef getProbeFNameForGUID(const GUIDProbeFunctionMap &GUID2FuncMAP,
                                      uint64_t GUID) {
  auto Iter = GUID2FuncMAP.find(GUID);
  assert(Iter != GUID2FuncMAP.end() &&
         "Probe function must exist for a valid GUID");
  return Iter->second.FuncName;
}

void PseudoProbeFuncDesc::print(raw_ostream &OS) {
  // Emit the descriptor on two lines: identity first, then the hash.
  OS << "GUID: " << FuncGUID << " Name: " << FuncName << "\n"
     << "Hash: " << FuncHash << "\n";
}

void PseudoProbe::getInlineContext(SmallVector<std::string, 16> &ContextStack,
                                   const GUIDProbeFunctionMap &GUID2FuncMAP,
                                   bool ShowName) const {
  // Remember where this probe's frames start: the caller may have
  // pre-populated the stack, and only the newly appended entries are
  // reversed at the end.
  uint32_t Begin = ContextStack.size();
  PseudoProbeInlineTree *Cur = InlineTree;
  // It will add the string of each node's inline site during iteration.
  // Note that it won't include the probe's belonging function(leaf location).
  // hasInlineSite() returns true only for the dummy site of an outlined
  // (uninlined) function, so the walk stops at the outermost frame.
  while (!Cur->hasInlineSite()) {
    std::string ContextStr;
    if (ShowName) {
      // Translate the inliner's GUID to its function name.
      StringRef FuncName =
          getProbeFNameForGUID(GUID2FuncMAP, std::get<0>(Cur->ISite));
      ContextStr += FuncName.str();
    } else {
      ContextStr += Twine(std::get<0>(Cur->ISite)).str();
    }
    ContextStr += ":";
    ContextStr += Twine(std::get<1>(Cur->ISite)).str();
    ContextStack.emplace_back(ContextStr);
    Cur = Cur->Parent;
  }
  // Make the ContextStack in caller-callee order
  std::reverse(ContextStack.begin() + Begin, ContextStack.end());
}

std::string
PseudoProbe::getInlineContextStr(const GUIDProbeFunctionMap &GUID2FuncMAP,
                                 bool ShowName) const {
  // Join the inline context frames with " @ ". Accumulate directly into a
  // std::string: the previous std::ostringstream version called .str() on
  // every iteration, copying the whole accumulated string each time
  // (accidentally quadratic).
  SmallVector<std::string, 16> ContextStack;
  getInlineContext(ContextStack, GUID2FuncMAP, ShowName);
  std::string OContextStr;
  for (const std::string &CxtStr : ContextStack) {
    if (!OContextStr.empty())
      OContextStr += " @ ";
    OContextStr += CxtStr;
  }
  return OContextStr;
}

// Display names indexed by the numeric value of PseudoProbeType
// (0 - Block, 1 - IndirectCall, 2 - DirectCall); used by PseudoProbe::print.
static const char *PseudoProbeTypeStr[3] = {"Block", "IndirectCall",
"DirectCall"};

// Dump one probe on a single line: owning function, index, type, attribute
// flags and (if any) its inline context. Used while disassembling.
void PseudoProbe::print(raw_ostream &OS,
                        const GUIDProbeFunctionMap &GUID2FuncMAP,
                        bool ShowName) {
  OS << "FUNC: ";
  if (ShowName)
    OS << getProbeFNameForGUID(GUID2FuncMAP, GUID).str() << " ";
  else
    OS << GUID << " ";
  OS << "Index: " << Index << " "
     << "Type: " << PseudoProbeTypeStr[static_cast<uint8_t>(Type)] << " ";
  if (isDangling())
    OS << "Dangling ";
  if (isTailCall())
    OS << "TailCall ";
  const std::string InlineContextStr =
      getInlineContextStr(GUID2FuncMAP, ShowName);
  if (!InlineContextStr.empty())
    OS << "Inlined: @ " << InlineContextStr;
  OS << "\n";
}

template <typename T> T PseudoProbeDecoder::readUnencodedNumber() {
if (Data + sizeof(T) > End) {
exitWithError("Decode unencoded number error in " + SectionName +
" section");
}
T Val = endian::readNext<T, little, unaligned>(Data);
return Val;
}

template <typename T> T PseudoProbeDecoder::readUnsignedNumber() {
unsigned NumBytesRead = 0;
uint64_t Val = decodeULEB128(Data, &NumBytesRead);
if (Val > std::numeric_limits<T>::max() || (Data + NumBytesRead > End)) {
exitWithError("Decode number error in " + SectionName + " section");
}
Data += NumBytesRead;
return static_cast<T>(Val);
}

template <typename T> T PseudoProbeDecoder::readSignedNumber() {
unsigned NumBytesRead = 0;
int64_t Val = decodeSLEB128(Data, &NumBytesRead);
if (Val > std::numeric_limits<T>::max() || (Data + NumBytesRead > End)) {
exitWithError("Decode number error in " + SectionName + " section");
}
Data += NumBytesRead;
return static_cast<T>(Val);
}

// Read a fixed-size (not null-terminated) string from the section buffer and
// advance Data past it; exits if fewer than Size bytes remain.
StringRef PseudoProbeDecoder::readString(uint32_t Size) {
  // Bounds-check before forming the view so we never hand out a StringRef
  // that extends past the end of the buffer (the original constructed the
  // StringRef first and checked afterwards).
  if (Data + Size > End) {
    exitWithError("Decode string error in " + SectionName + " section");
  }
  StringRef Str(reinterpret_cast<const char *>(Data), Size);
  Data += Size;
  return Str;
}

void PseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start,
                                               std::size_t Size) {
  // The pseudo_probe_desc section is a flat list of records, each:
  //   GUID (uint64), Hash (uint64), NameSize (ULEB128), Name (NameSize bytes)
  // e.g.
  //   .section .pseudo_probe_desc,"",@progbits
  //   .quad -5182264717993193164   // GUID
  //   .quad 4294967295             // Hash
  //   .uleb 3                      // Name size
  //   .ascii "foo"                 // Name
  //   .quad -2624081020897602054
  //   .quad 174696971957
  //   .uleb 34
  //   .ascii "main"
#ifndef NDEBUG
  SectionName = "pseudo_probe_desc";
#endif
  Data = Start;
  End = Data + Size;

  // Records are laid out back to back; decode until the buffer is exhausted.
  while (Data < End) {
    uint64_t GUID = readUnencodedNumber<uint64_t>();
    uint64_t Hash = readUnencodedNumber<uint64_t>();
    uint32_t NameSize = readUnsignedNumber<uint32_t>();
    StringRef Name = readString(NameSize);

    // Initialize PseudoProbeFuncDesc and populate it into GUID2FuncDescMap
    GUID2FuncDescMap.emplace(GUID, PseudoProbeFuncDesc(GUID, Hash, Name));
  }
  assert(Data == End && "Have unprocessed data in pseudo_probe_desc section");
}

void PseudoProbeDecoder::buildAddress2ProbeMap(const uint8_t *Start,
                                               std::size_t Size) {
  // The pseudo_probe section encodes an inline forest and each tree has a
  // format like:
  //  FUNCTION BODY (one for each uninlined function present in the text
  //  section)
  //     GUID (uint64)
  //         GUID of the function
  //     NPROBES (ULEB128)
  //         Number of probes originating from this function.
  //     NUM_INLINED_FUNCTIONS (ULEB128)
  //         Number of callees inlined into this function, aka number of
  //         first-level inlinees
  //     PROBE RECORDS
  //         A list of NPROBES entries. Each entry contains:
  //           INDEX (ULEB128)
  //           TYPE (uint4)
  //             0 - block probe, 1 - indirect call, 2 - direct call
  //           ATTRIBUTE (uint3)
  //             1 - tail call, 2 - dangling
  //           ADDRESS_TYPE (uint1)
  //             0 - code address, 1 - address delta
  //           CODE_ADDRESS (uint64 or ULEB128)
  //             code address or address delta, depending on Flag
  //     INLINED FUNCTION RECORDS
  //         A list of NUM_INLINED_FUNCTIONS entries describing each of the
  //         inlined callees. Each record contains:
  //           INLINE SITE
  //             GUID of the inlinee (uint64)
  //             Index of the callsite probe (ULEB128)
  //           FUNCTION BODY
  //             A FUNCTION BODY entry describing the inlined function.
#ifndef NDEBUG
  SectionName = "pseudo_probe";
#endif
  Data = Start;
  End = Data + Size;

  PseudoProbeInlineTree *Root = &DummyInlineRoot;
  PseudoProbeInlineTree *Cur = &DummyInlineRoot;
  uint64_t LastAddr = 0;
  uint32_t Index = 0;
  // A DFS-based decoding: each iteration decodes one tree node (an outlined
  // function when Cur == Root, otherwise an inlinee).
  while (Data < End) {
    // Read inline site for inlinees; top-level functions have no site.
    // NOTE(review): when Cur is back at Root for a second outlined function,
    // Index keeps the value left over from the previous tree, so the
    // InlineSite key for outlined functions is {0, <stale Index>}. Distinct
    // outlined functions could collide in Root's children map if those stale
    // values repeat — confirm the encoding rules out such collisions.
    if (Root != Cur) {
      Index = readUnsignedNumber<uint32_t>();
    }
    // Switch/add to a new tree node(inlinee)
    Cur = Cur->getOrAddNode({Cur->GUID, Index});
    // Read guid
    Cur->GUID = readUnencodedNumber<uint64_t>();
    // Read number of probes in the current node.
    uint32_t NodeCount = readUnsignedNumber<uint32_t>();
    // Read number of direct inlinees
    Cur->ChildrenToProcess = readUnsignedNumber<uint32_t>();
    // Read all probes in this node
    for (std::size_t I = 0; I < NodeCount; I++) {
      // Read index (shadows the outer Index, which tracks inline sites).
      uint32_t Index = readUnsignedNumber<uint32_t>();
      // Read type | flag: low 4 bits are the probe type, next 3 bits the
      // attributes, the top bit selects delta vs. absolute address encoding.
      uint8_t Value = readUnencodedNumber<uint8_t>();
      uint8_t Kind = Value & 0xf;
      uint8_t Attr = (Value & 0x70) >> 4;
      // Read address: either a signed delta from the previous probe's
      // address or a full absolute address.
      uint64_t Addr = 0;
      if (Value & 0x80) {
        int64_t Offset = readSignedNumber<int64_t>();
        Addr = LastAddr + Offset;
      } else {
        Addr = readUnencodedNumber<int64_t>();
      }
      // Populate Address2ProbesMap
      // NOTE(review): a pointer to ProbeVec.back() is stored in the tree
      // below; if another probe with the same Addr is appended later, the
      // vector may reallocate and dangle previously stored pointers —
      // confirm probes sharing an address cannot occur, or store indices.
      std::vector<PseudoProbe> &ProbeVec = Address2ProbesMap[Addr];
      ProbeVec.emplace_back(Addr, Cur->GUID, Index, PseudoProbeType(Kind), Attr,
                            Cur);
      Cur->addProbes(&ProbeVec.back());
      LastAddr = Addr;
    }

    // Look for the parent for the next node by subtracting the current
    // node count from tree counts along the parent chain. The first node
    // in the chain that has a non-zero tree count is the target.
    while (Cur != Root) {
      if (Cur->ChildrenToProcess == 0) {
        Cur = Cur->Parent;
        if (Cur != Root) {
          assert(Cur->ChildrenToProcess > 0 &&
                 "Should have some unprocessed nodes");
          Cur->ChildrenToProcess -= 1;
        }
      } else {
        break;
      }
    }
  }

  assert(Data == End && "Have unprocessed data in pseudo_probe section");
  assert(Cur == Root &&
         " Cur should point to root when the forest is fully built up");
}

void PseudoProbeDecoder::printGUID2FuncDescMap(raw_ostream &OS) {
  OS << "Pseudo Probe Desc:\n";
  // Copy into an ordered map so descriptors print in ascending GUID order,
  // keeping the output deterministic across runs.
  std::map<uint64_t, PseudoProbeFuncDesc> OrderedMap(GUID2FuncDescMap.begin(),
                                                     GUID2FuncDescMap.end());
  for (auto &Entry : OrderedMap)
    Entry.second.print(OS);
}

// Print every probe recorded at Address; no output if the address has none.
void PseudoProbeDecoder::printProbeForAddress(raw_ostream &OS,
                                              uint64_t Address) {
  auto Entry = Address2ProbesMap.find(Address);
  if (Entry == Address2ProbesMap.end())
    return;
  for (auto &Probe : Entry->second) {
    OS << " [Probe]:\t";
    Probe.print(OS, GUID2FuncDescMap, true);
  }
}

// Return the unique call probe located at Address, or nullptr when the
// address carries no probes or no call probe.
const PseudoProbe *
PseudoProbeDecoder::getCallProbeForAddr(uint64_t Address) const {
  auto Entry = Address2ProbesMap.find(Address);
  if (Entry == Address2ProbesMap.end())
    return nullptr;

  const PseudoProbe *CallProbe = nullptr;
  for (const PseudoProbe &Probe : Entry->second) {
    if (!Probe.isCall())
      continue;
    // A callsite address maps to at most one call probe.
    assert(!CallProbe &&
           "There should be only one call probe corresponding to address "
           "which is a callsite.");
    CallProbe = &Probe;
  }
  return CallProbe;
}

void PseudoProbeDecoder::getInlineContextForProbe(
    const PseudoProbe *Probe, SmallVector<std::string, 16> &InlineContextStack,
    bool IncludeLeaf) const {
  if (IncludeLeaf) {
    // Note that the context from probe doesn't include leaf frame,
    // hence we need to retrieve and prepend leaf if requested.
    auto It = GUID2FuncDescMap.find(Probe->GUID);
    assert(It != GUID2FuncDescMap.end() &&
           "Should have function descriptor for a valid GUID");
    StringRef FuncName = It->second.FuncName;
    // NOTE(review): emplace_back appends the leaf frame BEFORE the caller
    // frames that getInlineContext adds below, so with IncludeLeaf the result
    // is [leaf, root, ..., inliner] rather than the caller-to-callee order
    // ([main:1, foo:2, bar:3]) documented in the header — confirm which
    // ordering consumers actually expect.
    InlineContextStack.emplace_back(FuncName.str() + ":" +
                                    Twine(Probe->Index).str());
  }

  Probe->getInlineContext(InlineContextStack, GUID2FuncDescMap, true);
}

} // end namespace sampleprof
} // end namespace llvm
224 changes: 224 additions & 0 deletions llvm/tools/llvm-profgen/PseudoProbe.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
//===--- PseudoProbe.h - Pseudo probe decoding utilities ---------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TOOLS_LLVM_PROFGEN_PSEUDOPROBE_H
#define LLVM_TOOLS_LLVM_PROFGEN_PSEUDOPROBE_H

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/PseudoProbe.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO/SampleProfileProbe.h"
#include <algorithm>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

namespace llvm {
namespace sampleprof {

// Bit-mask flags or'ed into PseudoProbe::Attribute (see isTailCall /
// isDangling).
enum PseudoProbeAttributes { TAILCALL = 1, DANGLING = 2 };

// Use func GUID and index as the location info of the inline site
using InlineSite = std::tuple<uint64_t, uint32_t>;

struct PseudoProbe;

// Tree node to represent the inline relation and its inline site, we use a
// dummy root in the PseudoProbeDecoder to lead the tree, the outlined
// function will directly be the children of the dummy root. For the inlined
// function, all the inlinee will be connected to its inlineer, then further to
// its outlined function. Pseudo probes originating from the function stores the
// tree's leaf node which we can process backwards to get its inline context
class PseudoProbeInlineTree {
  // Probes physically located in this node's function (not in any inlinee).
  std::vector<PseudoProbe *> ProbeVector;

  struct InlineSiteHash {
    uint64_t operator()(const InlineSite &Site) const {
      return std::get<0>(Site) ^ std::get<1>(Site);
    }
  };
  std::unordered_map<InlineSite, std::unique_ptr<PseudoProbeInlineTree>,
                     InlineSiteHash>
      Children;

public:
  // Inlinee function GUID
  uint64_t GUID = 0;
  // Inline site to indicate the location in its inliner. As the node could
  // also be an outlined function, it will use a dummy InlineSite whose GUID
  // and Index is 0 connected to the dummy root
  InlineSite ISite;
  // Number of direct inlinees left to decode; used only during decoding
  uint32_t ChildrenToProcess = 0;
  // Caller node of the inline site. Initialized to nullptr so the dummy
  // root's parent is never an indeterminate pointer; getOrAddNode links
  // every real child to its parent.
  PseudoProbeInlineTree *Parent = nullptr;

  PseudoProbeInlineTree() {}
  PseudoProbeInlineTree(const InlineSite &Site) : ISite(Site) {}

  // Return the child for Site, creating (and parent-linking) it on first use.
  PseudoProbeInlineTree *getOrAddNode(const InlineSite &Site) {
    auto Ret =
        Children.emplace(Site, std::make_unique<PseudoProbeInlineTree>(Site));
    Ret.first->second->Parent = this;
    return Ret.first->second.get();
  }

  void addProbes(PseudoProbe *Probe) { ProbeVector.push_back(Probe); }

  // Returns TRUE when this node has no real inline site, i.e. it is an
  // outlined function hanging off the dummy root (ISite GUID is 0).
  // (The previous comment claimed the opposite polarity; callers such as
  // PseudoProbe::getInlineContext rely on the behavior documented here.)
  bool hasInlineSite() const { return !std::get<0>(ISite); }
};

// Function descriptor decoded from .pseudo_probe_desc section
struct PseudoProbeFuncDesc {
  // GUID identifying the function; keys GUIDProbeFunctionMap.
  uint64_t FuncGUID = 0;
  // Per-function hash recorded in .pseudo_probe_desc.
  // NOTE(review): presumably used to detect profiles stale w.r.t. a changed
  // function body — confirm against the emitter.
  uint64_t FuncHash = 0;
  std::string FuncName;

  PseudoProbeFuncDesc(uint64_t GUID, uint64_t Hash, StringRef Name)
      : FuncGUID(GUID), FuncHash(Hash), FuncName(Name){};

  // Dump "GUID: <guid> Name: <name>" and "Hash: <hash>" lines to OS.
  void print(raw_ostream &OS);
};

// GUID to PseudoProbeFuncDesc map
using GUIDProbeFunctionMap = std::unordered_map<uint64_t, PseudoProbeFuncDesc>;
// Address to pseudo probes map.
using AddressProbesMap = std::unordered_map<uint64_t, std::vector<PseudoProbe>>;

/*
A pseudo probe has the format like below:
INDEX (ULEB128)
TYPE (uint4)
0 - block probe, 1 - indirect call, 2 - direct call
ATTRIBUTE (uint3)
1 - tail call, 2 - dangling
ADDRESS_TYPE (uint1)
0 - code address, 1 - address delta
CODE_ADDRESS (uint64 or ULEB128)
code address or address delta, depending on Flag
*/
struct PseudoProbe {
  // Address the probe is attached to (preferred-base virtual address).
  uint64_t Address;
  // GUID of the function the probe originates from (possibly an inlinee).
  uint64_t GUID;
  // Probe id, unique within its function.
  uint32_t Index;
  // Block / indirect-call / direct-call, decoded from the TYPE bits.
  PseudoProbeType Type;
  // Bitwise OR of PseudoProbeAttributes flags.
  uint8_t Attribute;
  // Tree node of the function the probe physically lives in; walked
  // backwards through Parent links to recover the inline context.
  PseudoProbeInlineTree *InlineTree;
  // First id past the reserved range; a probe with this index marks the
  // function entry (see isEntry).
  const static uint32_t PseudoProbeFirstId =
      static_cast<uint32_t>(PseudoProbeReservedId::Last) + 1;

  PseudoProbe(uint64_t Ad, uint64_t G, uint32_t I, PseudoProbeType K,
              uint8_t At, PseudoProbeInlineTree *Tree)
      : Address(Ad), GUID(G), Index(I), Type(K), Attribute(At),
        InlineTree(Tree){};

  bool isEntry() const { return Index == PseudoProbeFirstId; }

  bool isDangling() const {
    return Attribute & static_cast<uint8_t>(PseudoProbeAttributes::DANGLING);
  }

  bool isTailCall() const {
    return Attribute & static_cast<uint8_t>(PseudoProbeAttributes::TAILCALL);
  }

  bool isBlock() const { return Type == PseudoProbeType::Block; }
  bool isIndirectCall() const { return Type == PseudoProbeType::IndirectCall; }
  bool isDirectCall() const { return Type == PseudoProbeType::DirectCall; }
  bool isCall() const { return isIndirectCall() || isDirectCall(); }

  // Get the inlined context by traversing current inline tree backwards,
  // each tree node has its InlineSite which is taken as the context.
  // \p ContextStack is populated in root to leaf order
  void getInlineContext(SmallVector<std::string, 16> &ContextStack,
                        const GUIDProbeFunctionMap &GUID2FuncMAP,
                        bool ShowName) const;
  // Helper function to get the string from context stack
  std::string getInlineContextStr(const GUIDProbeFunctionMap &GUID2FuncMAP,
                                  bool ShowName) const;
  // Print pseudo probe while disassembling
  void print(raw_ostream &OS, const GUIDProbeFunctionMap &GUID2FuncMAP,
             bool ShowName);
};

/*
  Decode pseudo probe info from ELF sections; used along with the ELF reader.
  Two sections are decoded here:
  1) \fn buildGUID2FuncDescMap is responsible for the .pseudo_probe_desc
  section, which encodes all function descriptors.
  2) \fn buildAddress2ProbeMap is responsible for the .pseudo_probe section,
  which encodes an inline function forest; each tree includes its
  inlined functions and all pseudo probes inside the function.
  See \file MCPseudoProbe.h for the details of the section encoding format.
*/
class PseudoProbeDecoder {
  // GUID to PseudoProbeFuncDesc map.
  GUIDProbeFunctionMap GUID2FuncDescMap;

  // Address to probes map.
  AddressProbesMap Address2ProbesMap;

  // The dummy root of the inline trie; all outlined functions are direct
  // children of the dummy root and every inlined function is a child of its
  // inliner. So the relation would be like:
  // DummyRoot --> OutlinedFunc --> InlinedFunc1 --> InlinedFunc2
  PseudoProbeInlineTree DummyInlineRoot;

  /// Points to the current location in the buffer.
  const uint8_t *Data = nullptr;

  /// Points to the end of the buffer.
  const uint8_t *End = nullptr;

  /// Name of the section currently being decoded, used in error messages.
  /// Declared unconditionally (previously guarded by #ifndef NDEBUG)
  /// because the read* helpers reference it in their error paths in every
  /// build mode; guarding the declaration broke -DNDEBUG builds. It is only
  /// assigned in asserts builds, so it may be empty in release messages.
  std::string SectionName;

  // Decoding helper functions.
  // Read a fixed-width little-endian value and advance Data.
  template <typename T> T readUnencodedNumber();
  // Read a ULEB128-encoded value and advance Data.
  template <typename T> T readUnsignedNumber();
  // Read a SLEB128-encoded value and advance Data.
  template <typename T> T readSignedNumber();
  // Read a fixed-size (not null-terminated) string and advance Data.
  StringRef readString(uint32_t Size);

public:
  // Decode pseudo_probe_desc section to build GUID to PseudoProbeFuncDesc map.
  void buildGUID2FuncDescMap(const uint8_t *Start, std::size_t Size);

  // Decode pseudo_probe section to build address to probes map.
  void buildAddress2ProbeMap(const uint8_t *Start, std::size_t Size);

  // Print pseudo_probe_desc section info
  void printGUID2FuncDescMap(raw_ostream &OS);

  // Print pseudo_probe section info, used along with show-disassembly
  void printProbeForAddress(raw_ostream &OS, uint64_t Address);

  // Look up the probe of a call for the input address
  const PseudoProbe *getCallProbeForAddr(uint64_t Address) const;

  // Helper function to populate one probe's inline stack into
  // \p InlineContextStack.
  // Current leaf location info will be added if IncludeLeaf is true
  // Example:
  //  Current probe(bar:3) inlined at foo:2 then inlined at main:1
  //  IncludeLeaf = true, Output: [main:1, foo:2, bar:3]
  //  IncludeLeaf = false, Output: [main:1, foo:2]
  void
  getInlineContextForProbe(const PseudoProbe *Probe,
                           SmallVector<std::string, 16> &InlineContextStack,
                           bool IncludeLeaf) const;
};

} // end namespace sampleprof
} // end namespace llvm

#endif