diff --git a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
index fe8ddcdf79129..0fc4883305145 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
@@ -122,6 +122,12 @@ class ProfileWriter {
   virtual void startContextSection() = 0;
   virtual void writeContextual(const ctx_profile::ContextNode &RootNode) = 0;
   virtual void endContextSection() = 0;
+
+  virtual void startFlatSection() = 0;
+  virtual void writeFlat(ctx_profile::GUID Guid, const uint64_t *Buffer,
+                         size_t BufferSize) = 0;
+  virtual void endFlatSection() = 0;
+
   virtual ~ProfileWriter() = default;
 };
 } // namespace ctx_profile
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index 992aa94a6631d..d7ec8fde4ec7d 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -8,6 +8,8 @@
 
 #include "CtxInstrProfiling.h"
 #include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_atomic_clang.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_dense_map.h"
 #include "sanitizer_common/sanitizer_libc.h"
@@ -27,6 +29,20 @@ __sanitizer::SpinMutex AllContextsMutex;
 SANITIZER_GUARDED_BY(AllContextsMutex)
 __sanitizer::Vector<ContextRoot *> AllContextRoots;
 
+// Keep all the functions for which we collect a flat profile in a linked list.
+__sanitizer::atomic_uintptr_t AllFunctionsData = {};
+
+__sanitizer::SpinMutex FlatCtxArenaMutex;
+SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
+Arena *FlatCtxArenaHead = nullptr;
+SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
+Arena *FlatCtxArena = nullptr;
+
+// Set to true when we enter a root, and false when we exit - regardless of
+// whether this thread collects a contextual profile for that root.
+__thread bool IsUnderContext = false;
+__sanitizer::atomic_uint8_t ProfilingStarted = {};
+
 // utility to taint a pointer by setting the LSB. There is an assumption
 // throughout that the addresses of contexts are even (really, they should be
 // align(8), but "even"-ness is the minimum assumption)
@@ -109,7 +125,10 @@ void resetContextNode(ContextNode &Node) {
       resetContextNode(*Next);
 }
 
-void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }
+ContextNode *onContextEnter(ContextNode &Node) {
+  ++Node.counters()[0];
+  return &Node;
+}
 
 } // namespace
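
The three new pure virtuals above make the flat section a first-class part of the `ProfileWriter` contract, so every existing writer must now implement them. For orientation only - not part of the patch, the class name and printf output are made up - a minimal conforming writer could look like this:

```c++
#include <cstddef>
#include <cstdint>
#include <cstdio>

#include "CtxInstrContextNode.h"

using namespace llvm::ctx_profile;

// Hypothetical do-nothing writer; only writeFlat does something visible so
// the example has observable behavior.
class StubProfileWriter final : public ProfileWriter {
public:
  void startContextSection() override {}
  void writeContextual(const ContextNode &RootNode) override {}
  void endContextSection() override {}

  void startFlatSection() override {}
  void writeFlat(GUID Guid, const uint64_t *Buffer,
                 size_t BufferSize) override {
    // A real writer serializes the counters; here we just print them.
    printf("%llu:", (unsigned long long)Guid);
    for (size_t I = 0; I < BufferSize; ++I)
      printf(" %llu", (unsigned long long)Buffer[I]);
    printf("\n");
  }
  void endFlatSection() override {}
};
```
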
@@ -182,12 +201,75 @@ ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
   return Ret;
 }
 
-ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
-                                            uint32_t NumCounters,
+ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
+                            uint32_t NumCounters) {
+  if (ContextNode *Existing = Data.FlatCtx)
+    return Existing;
+  {
+    // We could instead try to take the lock and, if that fails, return
+    // TheScratchContext. But that could leave message pump loops more sparsely
+    // profiled than everything else. Maybe that doesn't matter, and we can
+    // optimize this later.
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Data.Mutex);
+    if (ContextNode *Existing = Data.FlatCtx)
+      return Existing;
+
+    auto NeededSize = ContextNode::getAllocSize(NumCounters, 0);
+    char *AllocBuff = nullptr;
+    {
+      __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> FL(
+          &FlatCtxArenaMutex);
+      if (FlatCtxArena)
+        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
+      if (!AllocBuff) {
+        FlatCtxArena = Arena::allocateNewArena(getArenaAllocSize(NeededSize),
+                                               FlatCtxArena);
+        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
+      }
+      if (!FlatCtxArenaHead)
+        FlatCtxArenaHead = FlatCtxArena;
+    }
+    auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
+    Data.FlatCtx = Ret;
+
+    Data.Next = reinterpret_cast<FunctionData *>(
+        __sanitizer::atomic_load_relaxed(&AllFunctionsData));
+    while (!__sanitizer::atomic_compare_exchange_strong(
+        &AllFunctionsData, reinterpret_cast<uintptr_t *>(&Data.Next),
+        reinterpret_cast<uintptr_t>(&Data),
+        __sanitizer::memory_order_release)) {
+    }
+  }
+
+  return Data.FlatCtx;
+}
+
+ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
+                                 uint32_t NumCounters) {
+  // 1) if we are under a root (regardless of whether this thread is collecting
+  // a contextual profile for that root), do not collect a flat profile. We
+  // want to keep flat profiles only for activations that can't happen under a
+  // root, to avoid confusing profiles. We can, for example, combine flattened
+  // and flat profiles meaningfully, as we wouldn't double-count anything.
+  //
+  // 2) to avoid lengthy startup, don't bother with flat profiles until
+  // profiling has started. We would reset them anyway when profiling starts.
+  // HOWEVER. This does lose profiling for message pumps: those functions are
+  // entered once and never exit. They should be assumed to be entered before
+  // profiling starts - because profiling should start after the server is up
+  // and running (which is equivalent to "message pumps are set up").
+  if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
+    return TheScratchContext;
+  return markAsScratch(
+      onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
+}
+
+ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
+                                            GUID Guid, uint32_t NumCounters,
                                             uint32_t NumCallsites) {
   // fast "out" if we're not even doing contextual collection.
   if (!__llvm_ctx_profile_current_context_root)
-    return TheScratchContext;
+    return getUnhandledContext(*Data, Guid, NumCounters);
 
   // also fast "out" if the caller is scratch. We can see if it's scratch by
   // looking at the interior pointer into the subcontexts vector that the caller
@@ -196,7 +278,7 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
   // precisely, aligned - 8 values)
   auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
   if (!CallsiteContext || isScratch(CallsiteContext))
-    return TheScratchContext;
+    return getUnhandledContext(*Data, Guid, NumCounters);
 
   // if the callee isn't the expected one, return scratch.
   // Signal handler(s) could have been invoked at any point in the execution.
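
`getFlatProfile` publishes the per-function `FunctionData` on `AllFunctionsData` with the classic lock-free stack push: read the current head into `Data.Next`, then compare-and-swap the head from `Data.Next` to `&Data`. The runtime spells this with `uintptr_t` and `reinterpret_cast` because the sanitizer atomics are not typed over arbitrary pointers; the same idiom in plain `std::atomic` form looks like this (a sketch - `Node` and `GlobalList` are made-up names):

```c++
#include <atomic>

struct Node {
  Node *Next = nullptr;
};

std::atomic<Node *> GlobalList{nullptr};

void push(Node &N) {
  N.Next = GlobalList.load(std::memory_order_relaxed);
  // On failure, compare_exchange_weak reloads the current head into N.Next,
  // so the loop body is empty - exactly like the while loop in the patch.
  while (!GlobalList.compare_exchange_weak(N.Next, &N,
                                           std::memory_order_release,
                                           std::memory_order_relaxed)) {
  }
}
```

Nodes are only ever pushed, never removed, while profiling runs - which is what later makes the snapshot traversal in `__llvm_ctx_profile_fetch` safe.
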
@@ -214,7 +296,7 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
   // for that case.
   auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
   if (ExpectedCallee != Callee)
-    return TheScratchContext;
+    return getUnhandledContext(*Data, Guid, NumCounters);
 
   auto *Callsite = *CallsiteContext;
   // in the case of indirect calls, we will have all seen targets forming a
@@ -257,6 +339,7 @@ void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters,
 ContextNode *__llvm_ctx_profile_start_context(
     ContextRoot *Root, GUID Guid, uint32_t Counters,
     uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  IsUnderContext = true;
   if (!Root->FirstMemBlock) {
     setupContext(Root, Guid, Counters, Callsites);
   }
@@ -272,6 +355,7 @@ ContextNode *__llvm_ctx_profile_start_context(
 
 void __llvm_ctx_profile_release_context(ContextRoot *Root)
     SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  IsUnderContext = false;
   if (__llvm_ctx_profile_current_context_root) {
     __llvm_ctx_profile_current_context_root = nullptr;
     Root->Taken.Unlock();
@@ -291,10 +375,12 @@ void __llvm_ctx_profile_start_collection() {
 
       resetContextNode(*Root->FirstNode);
   }
+  __sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
  __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
 }
 
 bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
+  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
   __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
       &AllContextsMutex);
 
@@ -310,17 +396,43 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
     Writer.writeContextual(*Root->FirstNode);
   }
   Writer.endContextSection();
+  Writer.startFlatSection();
+  // The list only grows by pushing at the head, so snapshotting the head
+  // gives us a stable sub-list to traverse: concurrent insertions happen
+  // "in front of" the snapshot and cannot race with the traversal.
+  const auto *Pos = reinterpret_cast<const FunctionData *>(
+      __sanitizer::atomic_load_relaxed(&AllFunctionsData));
+  for (; Pos; Pos = Pos->Next)
+    Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
+                     Pos->FlatCtx->counters_size());
+  Writer.endFlatSection();
   return true;
 }
 
 void __llvm_ctx_profile_free() {
-  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
-      &AllContextsMutex);
-  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
-    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
+  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
+  {
+    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+        &AllContextsMutex);
+    for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
+      for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
+        auto *C = A;
+        A = A->next();
+        __sanitizer::InternalFree(C);
+      }
+    AllContextRoots.Reset();
+  }
+  __sanitizer::atomic_store_relaxed(&AllFunctionsData, 0U);
+  {
+    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+        &FlatCtxArenaMutex);
+    FlatCtxArena = nullptr;
+    for (auto *A = FlatCtxArenaHead; A;) {
       auto *C = A;
-      A = A->next();
+      A = C->next();
       __sanitizer::InternalFree(C);
     }
-  AllContextRoots.Reset();
+
+    FlatCtxArenaHead = nullptr;
+  }
 }
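
Taken together, these entry points imply a simple collection lifecycle: `__llvm_ctx_profile_start_collection` zeroes counters and flips `ProfilingStarted` on; `__llvm_ctx_profile_fetch` flips it off and then drains the context section followed by the flat section; `__llvm_ctx_profile_free` tears down both the per-root arenas and the flat arena list. A sketch of the embedding (`StubProfileWriter` is the made-up writer from the earlier sketch; in production this would sit in something like an RPC handler, per the comment in the test further down):

```c++
extern "C" void __llvm_ctx_profile_start_collection();
extern "C" bool __llvm_ctx_profile_fetch(llvm::ctx_profile::ProfileWriter &);
extern "C" void __llvm_ctx_profile_free();

void collectionCycle() {
  __llvm_ctx_profile_start_collection(); // reset counters, ProfilingStarted=true
  // ... run the server under load for the sampling window ...
  StubProfileWriter W;
  __llvm_ctx_profile_fetch(W); // ProfilingStarted=false, then drain sections
  __llvm_ctx_profile_free();   // optional: release all profiling memory
}
```
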
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
index 8a6949d4ec288..c41a77457178c 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -113,6 +113,28 @@ struct ContextRoot {
   static_assert(sizeof(Taken) == 1);
 };
 
+// This is allocated and zero-initialized by the compiler; the in-place
+// initialization serves mostly as self-documentation and for testing.
+// The design is influenced by the observation that typically (at least for
+// datacenter binaries, which is the motivating target of this profiler) less
+// than 10% of functions in a binary even appear in a profile (of any kind).
+//
+// 1) We could pre-allocate the flat profile storage in the compiler, just like
+// the flat instrumented profiling does. But that penalizes the static size of
+// the binary for little reason.
+//
+// 2) We could do the above but zero-initialize the buffers (which should place
+// them in .bss), and dynamically populate them. This, though, would page in
+// more memory upfront for the binary's runtime.
+//
+// The current design trades off a bit of overhead the first time a function
+// is encountered *for flat profiling* for avoiding size penalties.
+struct FunctionData {
+  FunctionData *Next = nullptr;
+  ContextNode *volatile FlatCtx = nullptr;
+  ::__sanitizer::StaticSpinMutex Mutex;
+};
+
 /// This API is exposed for testing. See the APIs below about the contract with
 /// LLVM.
 inline bool isScratch(const void *Ctx) {
@@ -152,7 +174,8 @@ void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root);
 /// called for any other function than entry points, in the entry BB of such
 /// function. Same consideration about LSB of returned value as
 /// .._start_context
-ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
+ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *Data,
+                                            void *Callee, GUID Guid,
                                             uint32_t NumCounters,
                                             uint32_t NumCallsites);
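
`FunctionData` is the compiler-materialized handle: the lowering pass (further down in this patch) emits one zero-initialized, internal-linkage global per non-entrypoint function and threads it through `__llvm_ctx_profile_get_context`. In C++ terms, the emitted code behaves roughly like the following (function name and GUID value are illustrative only):

```c++
using namespace __ctx_profile;

// What the compiler emits per function: an unnamed internal global in .bss.
static FunctionData FooData;

void foo() {
  // NumCounters/NumCallsites come from instrumentation; the GUID is made up.
  ContextNode *Ctx = __llvm_ctx_profile_get_context(
      &FooData, reinterpret_cast<void *>(&foo),
      /*Guid=*/0x1234, /*NumCounters=*/2, /*NumCallsites=*/1);
  (void)Ctx; // the instrumented body then updates counters through Ctx
}
```
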
diff --git a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
index 97292f9f1abff..01a8274774ecb 100644
--- a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
+++ b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
@@ -40,6 +40,7 @@ TEST(ArenaTest, Basic) {
 }
 
 TEST_F(ContextTest, Basic) {
+  __llvm_ctx_profile_start_collection();
   auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
   ASSERT_NE(Ctx, nullptr);
   EXPECT_NE(Root.CurrentMem, nullptr);
@@ -58,6 +59,7 @@ TEST_F(ContextTest, Basic) {
 }
 
 TEST_F(ContextTest, Callsite) {
+  __llvm_ctx_profile_start_collection();
   auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
   int FakeCalleeAddress = 0;
   const bool IsScratch = isScratch(Ctx);
@@ -67,7 +69,11 @@ TEST_F(ContextTest, Callsite) {
   __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
   __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
   // This is what the callee does
-  auto *Subctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+  FunctionData FData = {0};
+  auto *Subctx =
+      __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
+  // This should not have required creating a flat context.
+  EXPECT_EQ(FData.FlatCtx, nullptr);
   // We expect the subcontext to be appropriately placed and dimensioned
   EXPECT_EQ(Ctx->subContexts()[2], Subctx);
   EXPECT_EQ(Subctx->counters_size(), 3U);
@@ -81,29 +87,59 @@ TEST_F(ContextTest, Callsite) {
   __llvm_ctx_profile_release_context(&Root);
 }
 
-TEST_F(ContextTest, ScratchNoCollection) {
+TEST_F(ContextTest, ScratchNoCollectionProfilingNotStarted) {
+  // This test intentionally does not call __llvm_ctx_profile_start_collection.
   EXPECT_EQ(__llvm_ctx_profile_current_context_root, nullptr);
   int FakeCalleeAddress = 0;
   // this would be the very first function executing this. the TLS is empty,
   // too.
-  auto *Ctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+  FunctionData FData = {0};
+  auto *Ctx =
+      __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
+  // We never entered a context (_start_context was never called) - so the
+  // returned context must be a tagged pointer.
+  EXPECT_TRUE(isScratch(Ctx));
+  // Because we didn't start collection, no flat profile should have been
+  // allocated.
+  EXPECT_EQ(FData.FlatCtx, nullptr);
+}
+
+TEST_F(ContextTest, ScratchNoCollectionProfilingStarted) {
+  ASSERT_EQ(__llvm_ctx_profile_current_context_root, nullptr);
+  int FakeCalleeAddress = 0;
+  // Start collection, so the function gets a flat profile instead of scratch.
+  __llvm_ctx_profile_start_collection();
+  // this would be the very first function executing this. the TLS is empty,
+  // too.
+  FunctionData FData = {0};
+  auto *Ctx =
+      __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
   // We never entered a context (_start_context was never called) - so the
-  // returned context must be scratch.
+  // returned context must be a tagged pointer.
   EXPECT_TRUE(isScratch(Ctx));
+  // Because we never entered a context, we should have allocated a flat context.
+  EXPECT_NE(FData.FlatCtx, nullptr);
+  EXPECT_EQ(reinterpret_cast<uintptr_t>(FData.FlatCtx) + 1,
+            reinterpret_cast<uintptr_t>(Ctx));
 }
 
 TEST_F(ContextTest, ScratchDuringCollection) {
+  __llvm_ctx_profile_start_collection();
   auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
   int FakeCalleeAddress = 0;
   int OtherFakeCalleeAddress = 0;
   __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
   __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
-  auto *Subctx =
-      __llvm_ctx_profile_get_context(&OtherFakeCalleeAddress, 2, 3, 1);
+  FunctionData FData[3] = {0};
+  auto *Subctx = __llvm_ctx_profile_get_context(
      &FData[0], &OtherFakeCalleeAddress, 2, 3, 1);
   // We expected a different callee - so return scratch. It mimics what happens
   // in the case of a signal handler - in this case, OtherFakeCalleeAddress is
   // the signal handler.
   EXPECT_TRUE(isScratch(Subctx));
+  // We shouldn't have tried to return a flat context because we're under a
+  // root.
+  EXPECT_EQ(FData[0].FlatCtx, nullptr);
   EXPECT_EQ(__llvm_ctx_profile_expected_callee[0], nullptr);
   EXPECT_EQ(__llvm_ctx_profile_callsite[0], nullptr);
 
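
The `EXPECT_EQ(reinterpret_cast<uintptr_t>(FData.FlatCtx) + 1, ...)` check above pins down the tagging scheme: a flat context is handed back through `markAsScratch`, which just sets the LSB (contexts are at least even-aligned, so that bit is free). The arithmetic, spelled out as a sketch (helper names are illustrative; the runtime's equivalents are `markAsScratch`/`isScratch`):

```c++
#include <cstdint>

inline bool isTagged(const void *P) {
  return reinterpret_cast<uintptr_t>(P) & 1U;
}
inline void *tagScratch(void *P) {
  // Valid because context nodes are (at least) even-aligned.
  return reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(P) | 1U);
}
inline void *untagScratch(void *P) {
  return reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(P) &
                                  ~uintptr_t(1));
}
```
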
@@ -111,24 +147,27 @@ TEST_F(ContextTest, ScratchDuringCollection) {
   int ThirdFakeCalleeAddress = 0;
   __llvm_ctx_profile_expected_callee[1] = &ThirdFakeCalleeAddress;
   __llvm_ctx_profile_callsite[1] = &Subctx->subContexts()[0];
 
-  auto *Subctx2 =
-      __llvm_ctx_profile_get_context(&ThirdFakeCalleeAddress, 3, 0, 0);
+  auto *Subctx2 = __llvm_ctx_profile_get_context(
+      &FData[1], &ThirdFakeCalleeAddress, 3, 0, 0);
   // We again expect scratch because the '0' position is where the runtime
   // looks, so it doesn't matter the '1' position is populated correctly.
   EXPECT_TRUE(isScratch(Subctx2));
+  EXPECT_EQ(FData[1].FlatCtx, nullptr);
 
   __llvm_ctx_profile_expected_callee[0] = &ThirdFakeCalleeAddress;
   __llvm_ctx_profile_callsite[0] = &Subctx->subContexts()[0];
-  auto *Subctx3 =
-      __llvm_ctx_profile_get_context(&ThirdFakeCalleeAddress, 3, 0, 0);
+  auto *Subctx3 = __llvm_ctx_profile_get_context(
+      &FData[2], &ThirdFakeCalleeAddress, 3, 0, 0);
   // We expect scratch here, too, because the value placed in
   // __llvm_ctx_profile_callsite is scratch
   EXPECT_TRUE(isScratch(Subctx3));
+  EXPECT_EQ(FData[2].FlatCtx, nullptr);
 
   __llvm_ctx_profile_release_context(&Root);
 }
 
 TEST_F(ContextTest, NeedMoreMemory) {
+  __llvm_ctx_profile_start_collection();
   auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
   int FakeCalleeAddress = 0;
   const bool IsScratch = isScratch(Ctx);
@@ -136,9 +175,11 @@ TEST_F(ContextTest, NeedMoreMemory) {
   const auto *CurrentMem = Root.CurrentMem;
   __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
   __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  FunctionData FData = {0};
   // Allocate a massive subcontext to force new arena allocation
   auto *Subctx =
-      __llvm_ctx_profile_get_context(&FakeCalleeAddress, 3, 1 << 20, 1);
+      __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 3, 1 << 20, 1);
+  EXPECT_EQ(FData.FlatCtx, nullptr);
   EXPECT_EQ(Ctx->subContexts()[2], Subctx);
   EXPECT_NE(CurrentMem, Root.CurrentMem);
   EXPECT_NE(Root.CurrentMem, nullptr);
@@ -175,7 +216,9 @@ TEST_F(ContextTest, Dump) {
   int FakeCalleeAddress = 0;
   __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
   __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
-  auto *Subctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+  FunctionData FData = {0};
+  auto *Subctx =
+      __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
   (void)Subctx;
   __llvm_ctx_profile_release_context(&Root);
 
@@ -186,6 +229,9 @@ TEST_F(ContextTest, Dump) {
 
     int EnteredSectionCount = 0;
     int ExitedSectionCount = 0;
+    int EnteredFlatCount = 0;
+    int ExitedFlatCount = 0;
+    int FlatsWritten = 0;
 
     bool State = false;
 
@@ -217,6 +263,16 @@ TEST_F(ContextTest, Dump) {
       EXPECT_EQ(EnteredSectionCount, 1);
       ++ExitedSectionCount;
     }
+    void startFlatSection() override { ++EnteredFlatCount; }
+    void writeFlat(GUID Guid, const uint64_t *Buffer,
+                   size_t BufferSize) override {
+      ++FlatsWritten;
+      EXPECT_EQ(BufferSize, 3);
+      EXPECT_EQ(Buffer[0], 15U);
+      EXPECT_EQ(Buffer[1], 0U);
+      EXPECT_EQ(Buffer[2], 0U);
+    }
+    void endFlatSection() override { ++ExitedFlatCount; }
   };
 
   TestProfileWriter W(&Root, 1);
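
The test writer above only asserts on the single expected buffer (counter 0 is set to 15 later in the test; the remaining two stay zero). A writer that actually keeps the data would copy each (GUID, counters) pair out during the callback - the buffer points at the live counters, so it should not be retained past the call. A sketch (`FlatCollector` is a made-up name):

```c++
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

struct FlatCollector {
  // GUID -> copy of the counter values at fetch time.
  std::map<uint64_t, std::vector<uint64_t>> Flat;

  void writeFlat(uint64_t Guid, const uint64_t *Buffer, size_t BufferSize) {
    Flat[Guid].assign(Buffer, Buffer + BufferSize);
  }
};
```
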
@@ -226,10 +282,17 @@ TEST_F(ContextTest, Dump) {
 
   // this resets all counters but not the internal structure.
   __llvm_ctx_profile_start_collection();
+  auto *Flat =
+      __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
+  EXPECT_NE(FData.FlatCtx, nullptr);
+  FData.FlatCtx->counters()[0] = 15U;
   TestProfileWriter W2(&Root, 0);
   EXPECT_FALSE(W2.State);
   __llvm_ctx_profile_fetch(W2);
   EXPECT_TRUE(W2.State);
   EXPECT_EQ(W2.EnteredSectionCount, 1);
   EXPECT_EQ(W2.ExitedSectionCount, 1);
+  EXPECT_EQ(W2.EnteredFlatCount, 1);
+  EXPECT_EQ(W2.FlatsWritten, 1);
+  EXPECT_EQ(W2.ExitedFlatCount, 1);
 }
diff --git a/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp b/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
index cdf819cbefc3b..bf33b4423fd1f 100644
--- a/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
+++ b/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
@@ -15,6 +15,7 @@
 #include <iostream>
 
 using namespace llvm::ctx_profile;
+extern "C" void __llvm_ctx_profile_start_collection();
 extern "C" bool __llvm_ctx_profile_fetch(ProfileWriter &);
 
 // avoid name mangling
@@ -36,6 +37,15 @@ __attribute__((noinline)) void theRoot() {
     someFunction(I);
   }
 }
+
+__attribute__((noinline)) void flatFct() {
+  printf("flat check 1\n");
+  someFunction(1);
+#pragma nounroll
+  for (auto I = 0; I < 2; ++I) {
+    someFunction(I);
+  }
+}
 }
 
 // Make sure the program actually ran correctly.
@@ -43,6 +53,10 @@ __attribute__((noinline)) void theRoot() {
 // CHECK-NEXT: check odd
 // CHECK-NEXT: check even
 // CHECK-NEXT: check odd
+// CHECK-NEXT: flat check 1
+// CHECK-NEXT: check odd
+// CHECK-NEXT: check even
+// CHECK-NEXT: check odd
 
 class TestProfileWriter : public ProfileWriter {
   void printProfile(const ContextNode &Node, const std::string &Indent,
@@ -73,6 +87,22 @@ class TestProfileWriter : public ProfileWriter {
   void writeContextual(const ContextNode &RootNode) override {
     printProfile(RootNode, "", "");
   }
+
+  void startFlatSection() override {
+    std::cout << "Entered Flat Section" << std::endl;
+  }
+
+  void writeFlat(GUID Guid, const uint64_t *Buffer,
+                 size_t BufferSize) override {
+    std::cout << "Flat: " << Guid << " " << Buffer[0];
+    for (size_t I = 1U; I < BufferSize; ++I)
+      std::cout << "," << Buffer[I];
+    std::cout << std::endl;
+  }
+
+  void endFlatSection() override {
+    std::cout << "Exited Flat Section" << std::endl;
+  }
 };
 
 // 8657661246551306189 is theRoot. We expect 2 callsites and 2 counters - one
@@ -100,6 +130,11 @@ class TestProfileWriter : public ProfileWriter {
 // CHECK-NEXT: 2 counters and 2 callsites
 // CHECK-NEXT:   Counter values: 2 1
 // CHECK-NEXT: Exited Context Section
+// CHECK-NEXT: Entered Flat Section
+// CHECK-NEXT: Flat: 6759619411192316602 3,1
+// This is flatFct (guid: 14569438697463215220)
+// CHECK-NEXT: Flat: 14569438697463215220 1,2
+// CHECK-NEXT: Exited Flat Section
 
 bool profileWriter() {
   TestProfileWriter W;
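
A note on how the two expected `Flat:` lines line up with the code, under my reading that counter slot 0 is the entry count (`onContextEnter` bumps `counters()[0]`) and later slots belong to conditionally executed regions - the slot-to-branch mapping is an assumption, not something this patch spells out:

```c++
// flatFct, GUID 14569438697463215220 -> "1,2": entered once (slot 0 == 1),
// loop body executed twice (slot 1 == 2).
__attribute__((noinline)) void flatFct() {
  printf("flat check 1\n");
  someFunction(1);
#pragma nounroll
  for (auto I = 0; I < 2; ++I) {
    someFunction(I);
  }
}
// someFunction, GUID 6759619411192316602 -> "3,1": called three times outside
// the root (once directly, twice from the loop), with its conditional region
// running once across those calls. Its calls made under theRoot don't show up
// here - by design they are accounted in the contextual section instead.
```
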
@@ -107,7 +142,9 @@ bool profileWriter() {
 }
 int main(int argc, char **argv) {
+  __llvm_ctx_profile_start_collection();
   theRoot();
+  flatFct();
   // This would be implemented in a specific RPC handler, but here we just call
   // it directly.
   return !profileWriter();
 }
diff --git a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
index fe8ddcdf79129..0fc4883305145 100644
--- a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
+++ b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
@@ -122,6 +122,12 @@ class ProfileWriter {
   virtual void startContextSection() = 0;
   virtual void writeContextual(const ctx_profile::ContextNode &RootNode) = 0;
   virtual void endContextSection() = 0;
+
+  virtual void startFlatSection() = 0;
+  virtual void writeFlat(ctx_profile::GUID Guid, const uint64_t *Buffer,
+                         size_t BufferSize) = 0;
+  virtual void endFlatSection() = 0;
+
   virtual ~ProfileWriter() = default;
 };
 } // namespace ctx_profile
diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
index 40f355f99eb53..c5a724d9a2142 100644
--- a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
+++ b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
@@ -87,10 +87,10 @@ class PGOCtxProfileWriter final : public ctx_profile::ProfileWriter {
   void writeContextual(const ctx_profile::ContextNode &RootNode) override;
   void endContextSection() override;
 
-  void startFlatSection();
+  void startFlatSection() override;
   void writeFlat(ctx_profile::GUID Guid, const uint64_t *Buffer,
-                 size_t BufferSize);
-  void endFlatSection();
+                 size_t BufferSize) override;
+  void endFlatSection() override;
 
   // constants used in writing which a reader may find useful.
   static constexpr unsigned CodeLen = 2;
diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
index aa6bee23ad5ff..ffc2aec77ff91 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
@@ -12,6 +12,7 @@
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/IR/Analysis.h"
 #include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -53,6 +54,7 @@ class CtxInstrumentationLowerer final {
   ModuleAnalysisManager &MAM;
   Type *ContextNodeTy = nullptr;
   Type *ContextRootTy = nullptr;
+  Type *FunctionDataTy = nullptr;
 
   DenseMap<const Function *, Constant *> ContextRootMap;
   Function *StartCtx = nullptr;
@@ -120,6 +122,13 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M,
                       PointerTy,          /*CurrentMem*/
                       SanitizerMutexType, /*Taken*/
                   });
+  FunctionDataTy =
+      StructType::get(M.getContext(), {
+                          PointerTy,          /*Next*/
+                          PointerTy,          /*FlatCtx*/
+                          SanitizerMutexType, /*Mutex*/
+                      });
+
   // The Context header.
   ContextNodeTy = StructType::get(M.getContext(), {
                                       I64Ty, /*Guid*/
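
Since the runtime will treat the compiler-emitted global as a `FunctionData`, `FunctionDataTy` must mirror the C++ struct field-for-field - `{ ptr, ptr, i8 }` for `Next`, `FlatCtx`, and the spin-mutex byte (the patch as originally posted emitted only `{ ptr, i8 }`, which the runtime's three-field struct would overrun). A runtime-side compile-time guard one could add to catch such drift - a sketch, not part of the patch, and it assumes a 64-bit target:

```c++
#include <cstddef>

#include "CtxInstrProfiling.h"

static_assert(sizeof(void *) == 8, "offsets below assume a 64-bit target");
static_assert(offsetof(__ctx_profile::FunctionData, Next) == 0,
              "IR field 0: ptr");
static_assert(offsetof(__ctx_profile::FunctionData, FlatCtx) == 8,
              "IR field 1: ptr");
static_assert(offsetof(__ctx_profile::FunctionData, Mutex) == 16,
              "IR field 2: i8");
```
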
@@ -163,7 +171,8 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M,
   GetCtx = cast<Function>(
       M.getOrInsertFunction(CompilerRtAPINames::GetCtx,
                             FunctionType::get(PointerTy,
-                                              {PointerTy, /*Callee*/
+                                              {PointerTy, /*FunctionData*/
+                                               PointerTy, /*Callee*/
                                                I64Ty,     /*Guid*/
                                                I32Ty,     /*NumCounters*/
                                                I32Ty},    /*NumCallsites*/
@@ -224,7 +233,6 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
   assert(Mark->getIndex()->isZero());
 
   IRBuilder<> Builder(Mark);
-
   Guid = Builder.getInt64(
       AssignGUIDPass::getGUID(cast<Function>(*Mark->getNameValue())));
   // The type of the context of this function is now knowable since we have
@@ -248,9 +256,14 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
       ORE.emit(
           [&] { return OptimizationRemark(DEBUG_TYPE, "Entrypoint", &F); });
     } else {
-      Context =
-          Builder.CreateCall(GetCtx, {&F, Guid, Builder.getInt32(NumCounters),
-                                      Builder.getInt32(NumCallsites)});
+      // Do not give the global a name: per-function names would end up taking
+      // a lot of space in the binary; unnamed globals get compact numeric IDs.
+      auto *FData = new GlobalVariable(
+          M, FunctionDataTy, false, GlobalVariable::InternalLinkage,
+          Constant::getNullValue(FunctionDataTy));
+      Context = Builder.CreateCall(GetCtx, {FData, &F, Guid,
+                                            Builder.getInt32(NumCounters),
+                                            Builder.getInt32(NumCallsites)});
       ORE.emit([&] {
         return OptimizationRemark(DEBUG_TYPE, "RegularFunction", &F);
       });
diff --git a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
index 1927060de868e..e4a5ebdc818e6 100644
--- a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
+++ b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
@@ -13,6 +13,11 @@ declare void @bar()
 ; LOWERING: @another_entrypoint_no_callees_ctx_root = global { ptr, ptr, ptr, i8 } zeroinitializer
 ; LOWERING: @__llvm_ctx_profile_callsite = external hidden thread_local global ptr
 ; LOWERING: @__llvm_ctx_profile_expected_callee = external hidden thread_local global ptr
+; LOWERING: @[[GLOB0:[0-9]+]] = internal global { ptr, ptr, i8 } zeroinitializer
+; LOWERING: @[[GLOB1:[0-9]+]] = internal global { ptr, ptr, i8 } zeroinitializer
+; LOWERING: @[[GLOB2:[0-9]+]] = internal global { ptr, ptr, i8 } zeroinitializer
+; LOWERING: @[[GLOB3:[0-9]+]] = internal global { ptr, ptr, i8 } zeroinitializer
+; LOWERING: @[[GLOB4:[0-9]+]] = internal global { ptr, ptr, i8 } zeroinitializer
 ;.
 define void @foo(i32 %a, ptr %fct) {
 ; INSTRUMENT-LABEL: define void @foo(
@@ -34,7 +39,7 @@ define void @foo(i32 %a, ptr %fct) {
 ;
 ; LOWERING-LABEL: define void @foo(
 ; LOWERING-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) !guid [[META0:![0-9]+]] {
-; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @foo, i64 6699318081062747564, i32 2, i32 2)
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB0]], ptr @foo, i64 6699318081062747564, i32 2, i32 2)
 ; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
 ; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], 1
 ; LOWERING-NEXT:    [[TMP4:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_expected_callee)
@@ -176,7 +181,7 @@ define void @simple(i32 %a) {
 ;
 ; LOWERING-LABEL: define void @simple(
 ; LOWERING-SAME: i32 [[A:%.*]]) !guid [[META3:![0-9]+]] {
-; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @simple, i64 -3006003237940970099, i32 1, i32 0)
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB1]], ptr @simple, i64 -3006003237940970099, i32 1, i32 0)
 ; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
 ; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], -2
 ; LOWERING-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
@@ -200,7 +205,7 @@ define i32 @no_callsites(i32 %a) {
 ;
 ; LOWERING-LABEL: define i32 @no_callsites(
 ; LOWERING-SAME: i32 [[A:%.*]]) !guid [[META4:![0-9]+]] {
-; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @no_callsites, i64 5679753335911435902, i32 2, i32 0)
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB2]], ptr @no_callsites, i64 5679753335911435902, i32 2, i32 0)
 ; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
 ; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], -2
 ; LOWERING-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
@@ -232,7 +237,7 @@ define void @no_counters() {
 ;
 ; LOWERING-LABEL: define void @no_counters(
 ; LOWERING-SAME: ) !guid [[META5:![0-9]+]] {
-; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @no_counters, i64 5458232184388660970, i32 1, i32 1)
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB3]], ptr @no_counters, i64 5458232184388660970, i32 1, i32 1)
 ; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
 ; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], 1
 ; LOWERING-NEXT:    [[TMP4:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_expected_callee)
@@ -260,7 +265,7 @@ define void @inlineasm() {
 ;
 ; LOWERING-LABEL: define void @inlineasm(
 ; LOWERING-SAME: ) !guid [[META6:![0-9]+]] {
-; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @inlineasm, i64 -3771893999295659109, i32 1, i32 0)
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB4]], ptr @inlineasm, i64 -3771893999295659109, i32 1, i32 0)
 ; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
 ; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], -2
 ; LOWERING-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr