Skip to content

Commit

Permalink
[BOLT] Add support for dumping profile on MacOS
Browse files Browse the repository at this point in the history
Summary: Add support for dumping profile on MacOS.

(cherry picked from FBD25751363)
  • Loading branch information
Alexander Shaposhnikov authored and maksfb committed Jan 28, 2021
1 parent 3b876cc commit a0dd5b0
Show file tree
Hide file tree
Showing 8 changed files with 148 additions and 45 deletions.
28 changes: 17 additions & 11 deletions bolt/runtime/common.h
Expand Up @@ -17,6 +17,7 @@ typedef __SSIZE_TYPE__ ssize_t;

typedef unsigned long long uint64_t;
typedef unsigned uint32_t;
typedef unsigned char uint8_t;

typedef long long int64_t;
typedef int int32_t;
Expand Down Expand Up @@ -127,6 +128,21 @@ uint64_t __munmap(void *addr, uint64_t size) {
return ret;
}

/// Terminate the calling process by issuing the exit system call directly,
/// without going through libc.
///
/// \param code  exit status; handed to the kernel in %rdi (the "D" constraint).
/// \returns the value found in %rax after the `syscall` instruction.
///          NOTE(review): a successful exit should never return, so this value
///          is presumably only meaningful if the call fails — confirm.
uint64_t __exit(uint64_t code) {
// Pick the syscall number for the platform being compiled for.
// NOTE(review): 0x2000001 looks like the macOS BSD-class encoding of exit
// (0x2000000 | 1) and 231 like exit_group on x86-64 Linux — confirm against
// the respective syscall tables.
// The macro is never #undef'd, so it remains defined after this function.
#if defined(__APPLE__)
#define EXIT_SYSCALL 0x2000001
#else
#define EXIT_SYSCALL 231
#endif
uint64_t ret;
// The syscall number is pasted into the instruction text at preprocessing time
// via STRINGIFY. rcx and r11 are declared as clobbers (the x86-64 `syscall`
// instruction overwrites both), together with flags and memory.
__asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n"
"syscall\n"
: "=a"(ret)
: "D"(code)
: "cc", "rcx", "r11", "memory");
return ret;
}

// Helper functions for writing strings to the .fdata file. We intentionally
// avoid using libc names (lowercase memset) to make it clear it is our impl.

Expand Down Expand Up @@ -317,15 +333,7 @@ uint64_t __getppid() {
return ret;
}

/// Terminate the calling process by issuing system call number 231 directly.
/// NOTE(review): 231 is presumably exit_group on x86-64 Linux — confirm.
/// NOTE(review): another `__exit` with the same signature, which selects the
/// syscall number per platform, appears earlier in this file; confirm that only
/// one of the two definitions is meant to remain.
///
/// \param code  exit status; handed to the kernel in %rdi (the "D" constraint).
/// \returns the value found in %rax after the `syscall` instruction.
uint64_t __exit(uint64_t code) {
uint64_t ret;
// rcx and r11 are declared as clobbers (the x86-64 `syscall` instruction
// overwrites both), together with flags and memory.
__asm__ __volatile__("movq $231, %%rax\n"
"syscall\n"
: "=a"(ret)
: "D"(code)
: "cc", "rcx", "r11", "memory");
return ret;
}
#endif

void reportError(const char *Msg, uint64_t Size) {
__write(2, Msg, Size);
Expand Down Expand Up @@ -372,6 +380,4 @@ inline uint64_t alignTo(uint64_t Value, uint64_t Align) {
return (Value + Align - 1) / Align * Align;
}

#endif

} // anonymous namespace
106 changes: 74 additions & 32 deletions bolt/runtime/instr.cpp
Expand Up @@ -60,10 +60,11 @@

#if defined(__APPLE__)
// Accessor functions that are not defined in this runtime: BOLT's
// instrumentation pass creates them inside the rewritten binary (e.g. as
// `__bolt_instr_locations_getter`, `__bolt_instr_tables_getter`,
// `__bolt_instr_num_funcs_getter`). They are declared here with one leading
// underscore fewer because on OSX/iOS the final symbol name of an extern "C"
// entity gets one extra leading underscore.
extern "C" {

/// Base address of the array of 64-bit instrumentation counters.
extern uint64_t* _bolt_instr_locations_getter();
/// Number of counters in that array.
extern uint32_t _bolt_num_counters_getter();

/// Start of the description-tables blob that readDescriptions() parses.
extern uint8_t* _bolt_instr_tables_getter();
/// Number of function descriptions stored in the tables; used as the loop
/// bound when the profile is dumped.
extern uint32_t _bolt_instr_num_funcs_getter();
}

#else
Expand Down Expand Up @@ -106,6 +107,8 @@ extern void (*__bolt_trampoline_ind_tailcall)();
extern void (*__bolt_instr_init_ptr)();
extern void (*__bolt_instr_fini_ptr)();

#endif

namespace {

/// A simple allocator that mmaps a fixed size region and manages this space
Expand All @@ -124,14 +127,21 @@ class BumpPtrAllocator {
public:
void *allocate(size_t Size) {
Lock L(M);

if (StackBase == nullptr) {
#if defined(__APPLE__)
int MAP_PRIVATE_MAP_ANONYMOUS = 0x1002;
#else
int MAP_PRIVATE_MAP_ANONYMOUS = 0x22;
#endif
StackBase = reinterpret_cast<uint8_t *>(
__mmap(0, MaxSize, 0x3 /* PROT_READ | PROT_WRITE*/,
Shared ? 0x21 /*MAP_SHARED | MAP_ANONYMOUS*/
: 0x22 /* MAP_PRIVATE | MAP_ANONYMOUS*/,
: MAP_PRIVATE_MAP_ANONYMOUS /* MAP_PRIVATE | MAP_ANONYMOUS*/,
-1, 0));
StackSize = 0;
}

Size = alignTo(Size + sizeof(EntryMetadata), 16);
uint8_t *AllocAddress = StackBase + StackSize + sizeof(EntryMetadata);
auto *M = reinterpret_cast<EntryMetadata *>(StackBase + StackSize);
Expand Down Expand Up @@ -555,7 +565,7 @@ FunctionDescription::FunctionDescription(const uint8_t *FuncDesc) {

/// Read and mmap descriptions written by BOLT from the executable's notes
/// section
#ifdef HAVE_ELF_H
#if defined(HAVE_ELF_H) and !defined(__APPLE__)
ProfileWriterContext readDescriptions() {
ProfileWriterContext Result;
uint64_t FD = __open("/proc/self/exe",
Expand Down Expand Up @@ -614,16 +624,31 @@ ProfileWriterContext readDescriptions() {
reportError(ErrMsg, sizeof(ErrMsg));
return Result;
}

#else

/// Locate the instrumentation description tables that BOLT embedded in the
/// binary and return pointers to each of them.
///
/// The blob returned by _bolt_instr_tables_getter() is read as a sequence of
/// length-prefixed regions followed by the string table:
///
///   uint32_t IndCallDescSize
///   uint8_t  IndCallDescriptions[IndCallDescSize]
///   uint32_t IndCallTargetDescSize
///   uint8_t  IndCallTargets[IndCallTargetDescSize]
///   uint32_t FuncDescSize
///   uint8_t  FuncDescriptions[FuncDescSize]
///   char     Strings[]
///
/// Only the four pointers assigned below are filled in; no other member of the
/// returned context is touched here. The size fields are read in place through
/// uint32_t pointers with no alignment adjustment.
///
/// \returns a context whose table pointers alias the blob (nothing is copied).
ProfileWriterContext readDescriptions() {
  ProfileWriterContext Result;
  uint8_t *Tables = _bolt_instr_tables_getter();

  // Each size field sits immediately after the region that precedes it, so the
  // offset of every field is the sum of the earlier sizes plus 4 bytes per
  // size field already consumed.
  uint32_t IndCallDescSize = *reinterpret_cast<uint32_t *>(Tables);
  uint32_t IndCallTargetDescSize =
      *reinterpret_cast<uint32_t *>(Tables + 4 + IndCallDescSize);
  uint32_t FuncDescSize = *reinterpret_cast<uint32_t *>(
      Tables + 8 + IndCallDescSize + IndCallTargetDescSize);

  Result.IndCallDescriptions =
      reinterpret_cast<IndCallDescription *>(Tables + 4);
  Result.IndCallTargets = reinterpret_cast<IndCallTargetDescription *>(
      Tables + 8 + IndCallDescSize);
  Result.FuncDescriptions =
      Tables + 12 + IndCallDescSize + IndCallTargetDescSize;
  Result.Strings = reinterpret_cast<char *>(
      Tables + 12 + IndCallDescSize + IndCallTargetDescSize + FuncDescSize);
  return Result;
}

#endif

#if !defined(__APPLE__)
/// Debug by printing overall metadata global numbers to check it is sane
void printStats(const ProfileWriterContext &Ctx) {
char StatMsg[BufSize];
Expand All @@ -646,6 +671,8 @@ void printStats(const ProfileWriterContext &Ctx) {
StatPtr = strCopy(StatPtr, "\n");
__write(2, StatMsg, StatPtr - StatMsg);
}
#endif


/// This is part of a simple CFG representation in memory, where we store
/// a dynamically sized array of input and output edges per node, and store
Expand Down Expand Up @@ -708,6 +735,7 @@ Graph::Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D,
if (static_cast<int32_t>(D.Edges[I].ToNode) > MaxNodes)
MaxNodes = D.Edges[I].ToNode;
}

for (int I = 0; I < D.NumLeafNodes; ++I) {
if (static_cast<int32_t>(D.LeafNodes[I].Node) > MaxNodes)
MaxNodes = D.LeafNodes[I].Node;
Expand All @@ -730,6 +758,7 @@ Graph::Graph(BumpPtrAllocator &Alloc, const FunctionDescription &D,

// Initial allocations
CFGNodes = new (Alloc) Node[MaxNodes];

DEBUG(reportNumber("G->CFGNodes = 0x", (uint64_t)CFGNodes, 16));
SpanningTreeNodes = new (Alloc) Node[MaxNodes];
DEBUG(reportNumber("G->SpanningTreeNodes = 0x",
Expand Down Expand Up @@ -1104,34 +1133,41 @@ const uint8_t *writeFunctionProfile(int FD, ProfileWriterContext &Ctx,
const FunctionDescription F(FuncDesc);
const uint8_t *next = FuncDesc + F.getSize();

#if !defined(__APPLE__)
uint64_t *bolt_instr_locations = __bolt_instr_locations;
#else
uint64_t *bolt_instr_locations = _bolt_instr_locations_getter();
#endif

// Skip funcs we know are cold
#ifndef ENABLE_DEBUG
uint64_t CountersFreq = 0;
for (int I = 0; I < F.NumLeafNodes; ++I) {
CountersFreq += __bolt_instr_locations[F.LeafNodes[I].Counter];
CountersFreq += bolt_instr_locations[F.LeafNodes[I].Counter];
}
if (CountersFreq == 0) {
for (int I = 0; I < F.NumEdges; ++I) {
const uint32_t C = F.Edges[I].Counter;
if (C == 0xffffffff)
continue;
CountersFreq += __bolt_instr_locations[C];
CountersFreq += bolt_instr_locations[C];
}
if (CountersFreq == 0) {
for (int I = 0; I < F.NumCalls; ++I) {
const uint32_t C = F.Calls[I].Counter;
if (C == 0xffffffff)
continue;
CountersFreq += __bolt_instr_locations[C];
CountersFreq += bolt_instr_locations[C];
}
if (CountersFreq == 0)
return next;
}
}
#endif

Graph *G = new (Alloc) Graph(Alloc, F, __bolt_instr_locations, Ctx);
Graph *G = new (Alloc) Graph(Alloc, F, bolt_instr_locations, Ctx);
DEBUG(G->dump());

if (!G->EdgeFreqs && !G->CallFreqs) {
G->~Graph();
Alloc.deallocate(G);
Expand Down Expand Up @@ -1173,6 +1209,7 @@ const uint8_t *writeFunctionProfile(int FD, ProfileWriterContext &Ctx,
return next;
}

#if !defined(__APPLE__)
const IndCallTargetDescription *
ProfileWriterContext::lookupIndCallTarget(uint64_t Target) const {
uint32_t B = 0;
Expand Down Expand Up @@ -1293,8 +1330,13 @@ int openProfile() {
}
return FD;
}

#endif

} // anonymous namespace

#if !defined(__APPLE__)

/// Reset all counters in case you want to start profiling a new phase of your
/// program independently of prior phases.
/// The address of this function is printed by BOLT and this can be called by
Expand Down Expand Up @@ -1476,42 +1518,42 @@ extern "C" void __bolt_instr_fini() {

#if defined(__APPLE__)

extern "C" void __bolt_instr_data_dump() {
ProfileWriterContext Ctx = readDescriptions();

int FD = 2;
BumpPtrAllocator Alloc;
const uint8_t *FuncDesc = Ctx.FuncDescriptions;
uint32_t bolt_instr_num_funcs = _bolt_instr_num_funcs_getter();

for (int I = 0, E = bolt_instr_num_funcs; I < E; ++I) {
FuncDesc = writeFunctionProfile(FD, Ctx, FuncDesc, Alloc);
Alloc.clear();
DEBUG(reportNumber("FuncDesc now: ", (uint64_t)FuncDesc, 16));
}
assert(FuncDesc == (void *)Ctx.Strings,
"FuncDesc ptr must be equal to stringtable");
}

// On OSX/iOS the final symbol name of an extern "C" function/variable contains
// one extra leading underscore: _bolt_instr_setup -> __bolt_instr_setup.
/// Start-up hook of the instrumentation runtime on Mach-O; emitted into the
/// `__TEXT,__setup` section.
///
/// All work is bracketed by SAVE_ALL / RESTORE_ALL and nothing runs before
/// SAVE_ALL. NOTE(review): SAVE_ALL/RESTORE_ALL are presumably the runtime's
/// save/restore-all-registers sequences — confirm against their definition.
/// force_align_arg_pointer is kept so the body may assume an aligned stack
/// regardless of how the hook is entered.
extern "C"
__attribute__((section("__TEXT,__setup")))
__attribute__((force_align_arg_pointer))
void _bolt_instr_setup() {
  __asm__ __volatile__(SAVE_ALL :::);

  report("Hello!\n");

  __asm__ __volatile__(RESTORE_ALL :::);
}

/// Shutdown hook of the instrumentation runtime on Mach-O; emitted into the
/// `__TEXT,__fini` section.
///
/// Announces itself once and then dumps the collected profile. Nothing else is
/// printed here: __bolt_instr_data_dump() writes the profile to file
/// descriptor 2, so any extra diagnostics on that descriptor would end up mixed
/// into the profile text.
extern "C"
__attribute__((section("__TEXT,__fini")))
__attribute__((force_align_arg_pointer))
void _bolt_instr_fini() {
  report("Bye!\n");
  __bolt_instr_data_dump();
}

#endif
10 changes: 10 additions & 0 deletions bolt/src/MCPlusBuilder.h
Expand Up @@ -1748,6 +1748,16 @@ class MCPlusBuilder {
return {};
}

/// Build the instruction sequence that Instrumentation::createAuxiliaryFunctions
/// installs as the body of the `__bolt_instr_tables_getter` helper.
///
/// This base implementation is a placeholder: it always reaches
/// llvm_unreachable("not implemented").
/// NOTE(review): presumably target-specific builders override it — confirm.
///
/// \param Ctx  MC context; not used by this placeholder.
virtual std::vector<MCInst> createInstrTablesGetter(MCContext *Ctx) const {
llvm_unreachable("not implemented");
// Never reached; gives the function a return statement on every path.
return {};
}

/// Build the instruction sequence that Instrumentation::createAuxiliaryFunctions
/// installs as the body of the `__bolt_instr_num_funcs_getter` helper.
///
/// This base implementation is a placeholder: it always reaches
/// llvm_unreachable("not implemented").
/// NOTE(review): presumably target-specific builders override it — confirm.
///
/// \param Ctx  MC context; not used by this placeholder.
virtual std::vector<MCInst> createInstrNumFuncsGetter(MCContext *Ctx) const {
llvm_unreachable("not implemented");
// Never reached; gives the function a return statement on every path.
return {};
}

/// This method takes an indirect call instruction and splits it up into an
/// equivalent set of instructions that use direct calls for target
/// symbols/addresses that are contained in the Targets vector. This is done
Expand Down
2 changes: 2 additions & 0 deletions bolt/src/MachORewriteInstance.cpp
Expand Up @@ -424,6 +424,7 @@ void MachORewriteInstance::emitAndLink() {
if (Key == K) {
mapCodeSections(Key);
mapInstrumentationSection(Key, "__counters");
mapInstrumentationSection(Key, "__tables");
} else {
// TODO: Refactor addRuntimeLibSections to work properly on Mach-O
// and use it here.
Expand Down Expand Up @@ -494,6 +495,7 @@ void MachORewriteInstance::rewriteFile() {
}

writeInstrumentationSection("__counters", OS);
writeInstrumentationSection("__tables", OS);

// TODO: Refactor addRuntimeLibSections to work properly on Mach-O and
// use it here.
Expand Down
4 changes: 4 additions & 0 deletions bolt/src/Passes/Instrumentation.cpp
Expand Up @@ -612,6 +612,10 @@ void Instrumentation::createAuxiliaryFunctions(BinaryContext &BC) {
BC.MIB->createNumCountersGetter(BC.Ctx.get()));
createSimpleFunction("__bolt_instr_locations_getter",
BC.MIB->createInstrLocationsGetter(BC.Ctx.get()));
createSimpleFunction("__bolt_instr_tables_getter",
BC.MIB->createInstrTablesGetter(BC.Ctx.get()));
createSimpleFunction("__bolt_instr_num_funcs_getter",
BC.MIB->createInstrNumFuncsGetter(BC.Ctx.get()));
}

void Instrumentation::setupRuntimeLibrary(BinaryContext &BC) {
Expand Down

0 comments on commit a0dd5b0

Please sign in to comment.