diff --git a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
index fe8ddcdf79129..0fc4883305145 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
@@ -122,6 +122,12 @@ class ProfileWriter {
   virtual void startContextSection() = 0;
   virtual void writeContextual(const ctx_profile::ContextNode &RootNode) = 0;
   virtual void endContextSection() = 0;
+
+  virtual void startFlatSection() = 0;
+  virtual void writeFlat(ctx_profile::GUID Guid, const uint64_t *Buffer,
+                         size_t BufferSize) = 0;
+  virtual void endFlatSection() = 0;
+
   virtual ~ProfileWriter() = default;
 };
 } // namespace ctx_profile
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index 992aa94a6631d..d7ec8fde4ec7d 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -8,6 +8,8 @@
 
 #include "CtxInstrProfiling.h"
 #include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_atomic_clang.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_dense_map.h"
 #include "sanitizer_common/sanitizer_libc.h"
@@ -27,6 +29,20 @@ __sanitizer::SpinMutex AllContextsMutex;
 SANITIZER_GUARDED_BY(AllContextsMutex)
 __sanitizer::Vector<ContextRoot *> AllContextRoots;
 
+// Keep all the functions for which we collect a flat profile in a linked list.
+__sanitizer::atomic_uintptr_t AllFunctionsData = {};
+
+__sanitizer::SpinMutex FlatCtxArenaMutex;
+SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
+Arena *FlatCtxArenaHead = nullptr;
+SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
+Arena *FlatCtxArena = nullptr;
+
+// Set to true when we enter a root, and false when we exit - regardless of
+// whether this thread collects a contextual profile for that root.
+__thread bool IsUnderContext = false;
+__sanitizer::atomic_uint8_t ProfilingStarted = {};
+
 // utility to taint a pointer by setting the LSB. There is an assumption
 // throughout that the addresses of contexts are even (really, they should be
 // align(8), but "even"-ness is the minimum assumption)
@@ -109,7 +125,10 @@ void resetContextNode(ContextNode &Node) {
       resetContextNode(*Next);
 }
 
-void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }
+ContextNode *onContextEnter(ContextNode &Node) {
+  ++Node.counters()[0];
+  return &Node;
+}
 
 } // namespace
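
The three new pure virtuals above make the flat section a first-class part of the `ProfileWriter` contract, so every existing writer must now implement them. For orientation only - not part of the patch, the class name and printf output are made up - a minimal conforming writer could look like this:

```c++
#include <cstddef>
#include <cstdint>
#include <cstdio>

#include "CtxInstrContextNode.h"

using namespace llvm::ctx_profile;

// Hypothetical do-nothing writer; only writeFlat does something visible so
// the example has observable behavior.
class StubProfileWriter final : public ProfileWriter {
public:
  void startContextSection() override {}
  void writeContextual(const ContextNode &RootNode) override {}
  void endContextSection() override {}

  void startFlatSection() override {}
  void writeFlat(GUID Guid, const uint64_t *Buffer,
                 size_t BufferSize) override {
    // A real writer serializes the counters; here we just print them.
    printf("%llu:", (unsigned long long)Guid);
    for (size_t I = 0; I < BufferSize; ++I)
      printf(" %llu", (unsigned long long)Buffer[I]);
    printf("\n");
  }
  void endFlatSection() override {}
};
```
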
@@ -182,12 +201,75 @@ ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
   return Ret;
 }
 
-ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
-                                            uint32_t NumCounters,
+ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
+                            uint32_t NumCounters) {
+  if (ContextNode *Existing = Data.FlatCtx)
+    return Existing;
+  {
+    // We could instead try to take the lock and, if that fails, return
+    // TheScratchContext. But that could leave message pump loops more sparsely
+    // profiled than everything else. Maybe that doesn't matter, and we can
+    // optimize this later.
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Data.Mutex);
+    if (ContextNode *Existing = Data.FlatCtx)
+      return Existing;
+
+    auto NeededSize = ContextNode::getAllocSize(NumCounters, 0);
+    char *AllocBuff = nullptr;
+    {
+      __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> FL(
+          &FlatCtxArenaMutex);
+      if (FlatCtxArena)
+        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
+      if (!AllocBuff) {
+        FlatCtxArena = Arena::allocateNewArena(getArenaAllocSize(NeededSize),
+                                               FlatCtxArena);
+        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
+      }
+      if (!FlatCtxArenaHead)
+        FlatCtxArenaHead = FlatCtxArena;
+    }
+    auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
+    Data.FlatCtx = Ret;
+
+    Data.Next = reinterpret_cast<FunctionData *>(
+        __sanitizer::atomic_load_relaxed(&AllFunctionsData));
+    while (!__sanitizer::atomic_compare_exchange_strong(
+        &AllFunctionsData, reinterpret_cast<uintptr_t *>(&Data.Next),
+        reinterpret_cast<uintptr_t>(&Data),
+        __sanitizer::memory_order_release)) {
+    }
+  }
+
+  return Data.FlatCtx;
+}
+
+ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
+                                 uint32_t NumCounters) {
+  // 1) if we are under a root (regardless of whether this thread is collecting
+  // a contextual profile for that root), do not collect a flat profile. We
+  // want to keep flat profiles only for activations that can't happen under a
+  // root, to avoid confusing profiles. We can, for example, combine flattened
+  // and flat profiles meaningfully, as we wouldn't double-count anything.
+  //
+  // 2) to avoid lengthy startup, don't bother with flat profiles until
+  // profiling has started. We would reset them anyway when profiling starts.
+  // HOWEVER. This does lose profiling for message pumps: those functions are
+  // entered once and never exit. They should be assumed to be entered before
+  // profiling starts - because profiling should start after the server is up
+  // and running (which is equivalent to "message pumps are set up").
+  if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
+    return TheScratchContext;
+  return markAsScratch(
+      onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
+}
+
+ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
+                                            GUID Guid, uint32_t NumCounters,
                                             uint32_t NumCallsites) {
   // fast "out" if we're not even doing contextual collection.
   if (!__llvm_ctx_profile_current_context_root)
-    return TheScratchContext;
+    return getUnhandledContext(*Data, Guid, NumCounters);
 
   // also fast "out" if the caller is scratch. We can see if it's scratch by
   // looking at the interior pointer into the subcontexts vector that the caller
@@ -196,7 +278,7 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
   // precisely, aligned - 8 values)
   auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
   if (!CallsiteContext || isScratch(CallsiteContext))
-    return TheScratchContext;
+    return getUnhandledContext(*Data, Guid, NumCounters);
 
   // if the callee isn't the expected one, return scratch.
   // Signal handler(s) could have been invoked at any point in the execution.
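
`getFlatProfile` publishes the per-function `FunctionData` on `AllFunctionsData` with the classic lock-free stack push: read the current head into `Data.Next`, then compare-and-swap the head from `Data.Next` to `&Data`. The runtime spells this with `uintptr_t` and `reinterpret_cast` because the sanitizer atomics are not typed over arbitrary pointers; the same idiom in plain `std::atomic` form looks like this (a sketch - `Node` and `GlobalList` are made-up names):

```c++
#include <atomic>

struct Node {
  Node *Next = nullptr;
};

std::atomic<Node *> GlobalList{nullptr};

void push(Node &N) {
  N.Next = GlobalList.load(std::memory_order_relaxed);
  // On failure, compare_exchange_weak reloads the current head into N.Next,
  // so the loop body is empty - exactly like the while loop in the patch.
  while (!GlobalList.compare_exchange_weak(N.Next, &N,
                                           std::memory_order_release,
                                           std::memory_order_relaxed)) {
  }
}
```

Nodes are only ever pushed, never removed, while profiling runs - which is what later makes the snapshot traversal in `__llvm_ctx_profile_fetch` safe.
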
@@ -214,7 +296,7 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
   // for that case.
   auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
   if (ExpectedCallee != Callee)
-    return TheScratchContext;
+    return getUnhandledContext(*Data, Guid, NumCounters);
 
   auto *Callsite = *CallsiteContext;
   // in the case of indirect calls, we will have all seen targets forming a
@@ -257,6 +339,7 @@ void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters,
 ContextNode *__llvm_ctx_profile_start_context(
     ContextRoot *Root, GUID Guid, uint32_t Counters,
     uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  IsUnderContext = true;
   if (!Root->FirstMemBlock) {
     setupContext(Root, Guid, Counters, Callsites);
   }
@@ -272,6 +355,7 @@ ContextNode *__llvm_ctx_profile_start_context(
 
 void __llvm_ctx_profile_release_context(ContextRoot *Root)
     SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  IsUnderContext = false;
   if (__llvm_ctx_profile_current_context_root) {
     __llvm_ctx_profile_current_context_root = nullptr;
     Root->Taken.Unlock();
@@ -291,10 +375,12 @@ void __llvm_ctx_profile_start_collection() {
 
       resetContextNode(*Root->FirstNode);
   }
+  __sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
  __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
 }
 
 bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
+  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
   __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
       &AllContextsMutex);
 
@@ -310,17 +396,43 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
     Writer.writeContextual(*Root->FirstNode);
   }
   Writer.endContextSection();
+  Writer.startFlatSection();
+  // The list only grows by pushing at the head, so snapshotting the head
+  // gives us a stable sub-list to traverse: concurrent insertions happen
+  // "in front of" the snapshot and cannot race with the traversal.
+  const auto *Pos = reinterpret_cast<const FunctionData *>(
+      __sanitizer::atomic_load_relaxed(&AllFunctionsData));
+  for (; Pos; Pos = Pos->Next)
+    Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
+                     Pos->FlatCtx->counters_size());
+  Writer.endFlatSection();
   return true;
 }
 
 void __llvm_ctx_profile_free() {
-  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
-      &AllContextsMutex);
-  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
-    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
+  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
+  {
+    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+        &AllContextsMutex);
+    for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
+      for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
+        auto *C = A;
+        A = A->next();
+        __sanitizer::InternalFree(C);
+      }
+    AllContextRoots.Reset();
+  }
+  __sanitizer::atomic_store_relaxed(&AllFunctionsData, 0U);
+  {
+    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+        &FlatCtxArenaMutex);
+    FlatCtxArena = nullptr;
+    for (auto *A = FlatCtxArenaHead; A;) {
       auto *C = A;
-      A = A->next();
+      A = C->next();
       __sanitizer::InternalFree(C);
     }
-  AllContextRoots.Reset();
+
+    FlatCtxArenaHead = nullptr;
+  }
 }
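
Taken together, these entry points imply a simple collection lifecycle: `__llvm_ctx_profile_start_collection` zeroes counters and flips `ProfilingStarted` on; `__llvm_ctx_profile_fetch` flips it off and then drains the context section followed by the flat section; `__llvm_ctx_profile_free` tears down both the per-root arenas and the flat arena list. A sketch of the embedding (`StubProfileWriter` is the made-up writer from the earlier sketch; in production this would sit in something like an RPC handler, per the comment in the test further down):

```c++
extern "C" void __llvm_ctx_profile_start_collection();
extern "C" bool __llvm_ctx_profile_fetch(llvm::ctx_profile::ProfileWriter &);
extern "C" void __llvm_ctx_profile_free();

void collectionCycle() {
  __llvm_ctx_profile_start_collection(); // reset counters, ProfilingStarted=true
  // ... run the server under load for the sampling window ...
  StubProfileWriter W;
  __llvm_ctx_profile_fetch(W); // ProfilingStarted=false, then drain sections
  __llvm_ctx_profile_free();   // optional: release all profiling memory
}
```
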
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
index 8a6949d4ec288..c41a77457178c 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -113,6 +113,28 @@ struct ContextRoot {
   static_assert(sizeof(Taken) == 1);
 };
 
+// This is allocated and zero-initialized by the compiler; the in-place
+// initialization serves mostly as self-documentation and for testing.
+// The design is influenced by the observation that typically (at least for
+// datacenter binaries, which is the motivating target of this profiler) less
+// than 10% of functions in a binary even appear in a profile (of any kind).
+//
+// 1) We could pre-allocate the flat profile storage in the compiler, just like
+// the flat instrumented profiling does. But that penalizes the static size of
+// the binary for little reason.
+//
+// 2) We could do the above but zero-initialize the buffers (which should place
+// them in .bss), and dynamically populate them. This, though, would page in
+// more memory upfront for the binary's runtime.
+//
+// The current design trades off a bit of overhead the first time a function
+// is encountered *for flat profiling* for avoiding size penalties.
+struct FunctionData {
+  FunctionData *Next = nullptr;
+  ContextNode *volatile FlatCtx = nullptr;
+  ::__sanitizer::StaticSpinMutex Mutex;
+};
+
 /// This API is exposed for testing. See the APIs below about the contract with
 /// LLVM.
 inline bool isScratch(const void *Ctx) {
@@ -152,7 +174,8 @@ void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root);
 /// called for any other function than entry points, in the entry BB of such
 /// function. Same consideration about LSB of returned value as
 /// .._start_context
-ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
+ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *Data,
+                                            void *Callee, GUID Guid,
                                             uint32_t NumCounters,
                                             uint32_t NumCallsites);
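
`FunctionData` is the compiler-materialized handle: the lowering pass (further down in this patch) emits one zero-initialized, internal-linkage global per non-entrypoint function and threads it through `__llvm_ctx_profile_get_context`. In C++ terms, the emitted code behaves roughly like the following (function name and GUID value are illustrative only):

```c++
using namespace __ctx_profile;

// What the compiler emits per function: an unnamed internal global in .bss.
static FunctionData FooData;

void foo() {
  // NumCounters/NumCallsites come from instrumentation; the GUID is made up.
  ContextNode *Ctx = __llvm_ctx_profile_get_context(
      &FooData, reinterpret_cast<void *>(&foo),
      /*Guid=*/0x1234, /*NumCounters=*/2, /*NumCallsites=*/1);
  (void)Ctx; // the instrumented body then updates counters through Ctx
}
```
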
diff --git a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
index 97292f9f1abff..01a8274774ecb 100644
--- a/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
+++ b/compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp
@@ -40,6 +40,7 @@ TEST(ArenaTest, Basic) {
 }
 
 TEST_F(ContextTest, Basic) {
+  __llvm_ctx_profile_start_collection();
   auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
   ASSERT_NE(Ctx, nullptr);
   EXPECT_NE(Root.CurrentMem, nullptr);
@@ -58,6 +59,7 @@ TEST_F(ContextTest, Basic) {
 }
 
 TEST_F(ContextTest, Callsite) {
+  __llvm_ctx_profile_start_collection();
   auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
   int FakeCalleeAddress = 0;
   const bool IsScratch = isScratch(Ctx);
@@ -67,7 +69,11 @@ TEST_F(ContextTest, Callsite) {
   __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
   __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
   // This is what the callee does
-  auto *Subctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+  FunctionData FData = {0};
+  auto *Subctx =
+      __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
+  // This should not have required creating a flat context.
+  EXPECT_EQ(FData.FlatCtx, nullptr);
   // We expect the subcontext to be appropriately placed and dimensioned
   EXPECT_EQ(Ctx->subContexts()[2], Subctx);
   EXPECT_EQ(Subctx->counters_size(), 3U);
@@ -81,29 +87,59 @@ TEST_F(ContextTest, Callsite) {
   __llvm_ctx_profile_release_context(&Root);
 }
 
-TEST_F(ContextTest, ScratchNoCollection) {
+TEST_F(ContextTest, ScratchNoCollectionProfilingNotStarted) {
+  // This test intentionally does not call __llvm_ctx_profile_start_collection.
   EXPECT_EQ(__llvm_ctx_profile_current_context_root, nullptr);
   int FakeCalleeAddress = 0;
   // this would be the very first function executing this. the TLS is empty,
   // too.
-  auto *Ctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+  FunctionData FData = {0};
+  auto *Ctx =
+      __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
+  // We never entered a context (_start_context was never called) - so the
+  // returned context must be a tagged pointer.
+  EXPECT_TRUE(isScratch(Ctx));
+  // Because we didn't start collection, no flat profile should have been
+  // allocated.
+  EXPECT_EQ(FData.FlatCtx, nullptr);
+}
+
+TEST_F(ContextTest, ScratchNoCollectionProfilingStarted) {
+  ASSERT_EQ(__llvm_ctx_profile_current_context_root, nullptr);
+  int FakeCalleeAddress = 0;
+  // Start collection, so the function gets a flat profile instead of scratch.
+  __llvm_ctx_profile_start_collection();
+  // this would be the very first function executing this. the TLS is empty,
+  // too.
+  FunctionData FData = {0};
+  auto *Ctx =
+      __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
   // We never entered a context (_start_context was never called) - so the
-  // returned context must be scratch.
+  // returned context must be a tagged pointer.
   EXPECT_TRUE(isScratch(Ctx));
+  // Because we never entered a context, we should have allocated a flat context.
+  EXPECT_NE(FData.FlatCtx, nullptr);
+  EXPECT_EQ(reinterpret_cast<uintptr_t>(FData.FlatCtx) + 1,
+            reinterpret_cast<uintptr_t>(Ctx));
 }
 
 TEST_F(ContextTest, ScratchDuringCollection) {
+  __llvm_ctx_profile_start_collection();
   auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
   int FakeCalleeAddress = 0;
   int OtherFakeCalleeAddress = 0;
   __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
   __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
-  auto *Subctx =
-      __llvm_ctx_profile_get_context(&OtherFakeCalleeAddress, 2, 3, 1);
+  FunctionData FData[3] = {0};
+  auto *Subctx = __llvm_ctx_profile_get_context(
      &FData[0], &OtherFakeCalleeAddress, 2, 3, 1);
   // We expected a different callee - so return scratch. It mimics what happens
   // in the case of a signal handler - in this case, OtherFakeCalleeAddress is
   // the signal handler.
   EXPECT_TRUE(isScratch(Subctx));
+  // We shouldn't have tried to return a flat context because we're under a
+  // root.
+  EXPECT_EQ(FData[0].FlatCtx, nullptr);
   EXPECT_EQ(__llvm_ctx_profile_expected_callee[0], nullptr);
   EXPECT_EQ(__llvm_ctx_profile_callsite[0], nullptr);
 
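
The `EXPECT_EQ(reinterpret_cast<uintptr_t>(FData.FlatCtx) + 1, ...)` check above pins down the tagging scheme: a flat context is handed back through `markAsScratch`, which just sets the LSB (contexts are at least even-aligned, so that bit is free). The arithmetic, spelled out as a sketch (helper names are illustrative; the runtime's equivalents are `markAsScratch`/`isScratch`):

```c++
#include <cstdint>

inline bool isTagged(const void *P) {
  return reinterpret_cast<uintptr_t>(P) & 1U;
}
inline void *tagScratch(void *P) {
  // Valid because context nodes are (at least) even-aligned.
  return reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(P) | 1U);
}
inline void *untagScratch(void *P) {
  return reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(P) &
                                  ~uintptr_t(1));
}
```
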
@@ -111,24 +147,27 @@ TEST_F(ContextTest, ScratchDuringCollection) {
   int ThirdFakeCalleeAddress = 0;
   __llvm_ctx_profile_expected_callee[1] = &ThirdFakeCalleeAddress;
   __llvm_ctx_profile_callsite[1] = &Subctx->subContexts()[0];
 
-  auto *Subctx2 =
-      __llvm_ctx_profile_get_context(&ThirdFakeCalleeAddress, 3, 0, 0);
+  auto *Subctx2 = __llvm_ctx_profile_get_context(
+      &FData[1], &ThirdFakeCalleeAddress, 3, 0, 0);
   // We again expect scratch because the '0' position is where the runtime
   // looks, so it doesn't matter the '1' position is populated correctly.
   EXPECT_TRUE(isScratch(Subctx2));
+  EXPECT_EQ(FData[1].FlatCtx, nullptr);
 
   __llvm_ctx_profile_expected_callee[0] = &ThirdFakeCalleeAddress;
   __llvm_ctx_profile_callsite[0] = &Subctx->subContexts()[0];
-  auto *Subctx3 =
-      __llvm_ctx_profile_get_context(&ThirdFakeCalleeAddress, 3, 0, 0);
+  auto *Subctx3 = __llvm_ctx_profile_get_context(
+      &FData[2], &ThirdFakeCalleeAddress, 3, 0, 0);
   // We expect scratch here, too, because the value placed in
   // __llvm_ctx_profile_callsite is scratch
   EXPECT_TRUE(isScratch(Subctx3));
+  EXPECT_EQ(FData[2].FlatCtx, nullptr);
 
   __llvm_ctx_profile_release_context(&Root);
 }
 
 TEST_F(ContextTest, NeedMoreMemory) {
+  __llvm_ctx_profile_start_collection();
   auto *Ctx = __llvm_ctx_profile_start_context(&Root, 1, 10, 4);
   int FakeCalleeAddress = 0;
   const bool IsScratch = isScratch(Ctx);
@@ -136,9 +175,11 @@ TEST_F(ContextTest, NeedMoreMemory) {
   const auto *CurrentMem = Root.CurrentMem;
   __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
   __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
+  FunctionData FData = {0};
   // Allocate a massive subcontext to force new arena allocation
   auto *Subctx =
-      __llvm_ctx_profile_get_context(&FakeCalleeAddress, 3, 1 << 20, 1);
+      __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 3, 1 << 20, 1);
+  EXPECT_EQ(FData.FlatCtx, nullptr);
   EXPECT_EQ(Ctx->subContexts()[2], Subctx);
   EXPECT_NE(CurrentMem, Root.CurrentMem);
   EXPECT_NE(Root.CurrentMem, nullptr);
@@ -175,7 +216,9 @@ TEST_F(ContextTest, Dump) {
   int FakeCalleeAddress = 0;
   __llvm_ctx_profile_expected_callee[0] = &FakeCalleeAddress;
   __llvm_ctx_profile_callsite[0] = &Ctx->subContexts()[2];
-  auto *Subctx = __llvm_ctx_profile_get_context(&FakeCalleeAddress, 2, 3, 1);
+  FunctionData FData = {0};
+  auto *Subctx =
+      __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
   (void)Subctx;
   __llvm_ctx_profile_release_context(&Root);
 
@@ -186,6 +229,9 @@ TEST_F(ContextTest, Dump) {
 
     int EnteredSectionCount = 0;
     int ExitedSectionCount = 0;
+    int EnteredFlatCount = 0;
+    int ExitedFlatCount = 0;
+    int FlatsWritten = 0;
 
     bool State = false;
 
@@ -217,6 +263,16 @@ TEST_F(ContextTest, Dump) {
       EXPECT_EQ(EnteredSectionCount, 1);
       ++ExitedSectionCount;
     }
+    void startFlatSection() override { ++EnteredFlatCount; }
+    void writeFlat(GUID Guid, const uint64_t *Buffer,
+                   size_t BufferSize) override {
+      ++FlatsWritten;
+      EXPECT_EQ(BufferSize, 3);
+      EXPECT_EQ(Buffer[0], 15U);
+      EXPECT_EQ(Buffer[1], 0U);
+      EXPECT_EQ(Buffer[2], 0U);
+    }
+    void endFlatSection() override { ++ExitedFlatCount; }
   };
 
   TestProfileWriter W(&Root, 1);
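
The test writer above only asserts on the single expected buffer (counter 0 is set to 15 later in the test; the remaining two stay zero). A writer that actually keeps the data would copy each (GUID, counters) pair out during the callback - the buffer points at the live counters, so it should not be retained past the call. A sketch (`FlatCollector` is a made-up name):

```c++
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

struct FlatCollector {
  // GUID -> copy of the counter values at fetch time.
  std::map<uint64_t, std::vector<uint64_t>> Flat;

  void writeFlat(uint64_t Guid, const uint64_t *Buffer, size_t BufferSize) {
    Flat[Guid].assign(Buffer, Buffer + BufferSize);
  }
};
```
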
@@ -226,10 +282,17 @@ TEST_F(ContextTest, Dump) {
 
   // this resets all counters but not the internal structure.
   __llvm_ctx_profile_start_collection();
+  auto *Flat =
+      __llvm_ctx_profile_get_context(&FData, &FakeCalleeAddress, 2, 3, 1);
+  EXPECT_NE(FData.FlatCtx, nullptr);
+  FData.FlatCtx->counters()[0] = 15U;
   TestProfileWriter W2(&Root, 0);
   EXPECT_FALSE(W2.State);
   __llvm_ctx_profile_fetch(W2);
   EXPECT_TRUE(W2.State);
   EXPECT_EQ(W2.EnteredSectionCount, 1);
   EXPECT_EQ(W2.ExitedSectionCount, 1);
+  EXPECT_EQ(W2.EnteredFlatCount, 1);
+  EXPECT_EQ(W2.FlatsWritten, 1);
+  EXPECT_EQ(W2.ExitedFlatCount, 1);
 }
diff --git a/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp b/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
index cdf819cbefc3b..bf33b4423fd1f 100644
--- a/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
+++ b/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
@@ -15,6 +15,7 @@
 #include <iostream>
 
 using namespace llvm::ctx_profile;
+extern "C" void __llvm_ctx_profile_start_collection();
 extern "C" bool __llvm_ctx_profile_fetch(ProfileWriter &);
 
 // avoid name mangling
@@ -36,6 +37,15 @@ __attribute__((noinline)) void theRoot() {
     someFunction(I);
   }
 }
+
+__attribute__((noinline)) void flatFct() {
+  printf("flat check 1\n");
+  someFunction(1);
+#pragma nounroll
+  for (auto I = 0; I < 2; ++I) {
+    someFunction(I);
+  }
+}
 }
 
 // Make sure the program actually ran correctly.
@@ -43,6 +53,10 @@ __attribute__((noinline)) void theRoot() {
 // CHECK-NEXT: check odd
 // CHECK-NEXT: check even
 // CHECK-NEXT: check odd
+// CHECK-NEXT: flat check 1
+// CHECK-NEXT: check odd
+// CHECK-NEXT: check even
+// CHECK-NEXT: check odd
 
 class TestProfileWriter : public ProfileWriter {
   void printProfile(const ContextNode &Node, const std::string &Indent,
@@ -73,6 +87,22 @@ class TestProfileWriter : public ProfileWriter {
   void writeContextual(const ContextNode &RootNode) override {
     printProfile(RootNode, "", "");
   }
+
+  void startFlatSection() override {
+    std::cout << "Entered Flat Section" << std::endl;
+  }
+
+  void writeFlat(GUID Guid, const uint64_t *Buffer,
+                 size_t BufferSize) override {
+    std::cout << "Flat: " << Guid << " " << Buffer[0];
+    for (size_t I = 1U; I < BufferSize; ++I)
+      std::cout << "," << Buffer[I];
+    std::cout << std::endl;
+  }
+
+  void endFlatSection() override {
+    std::cout << "Exited Flat Section" << std::endl;
+  }
 };
 
 // 8657661246551306189 is theRoot. We expect 2 callsites and 2 counters - one
@@ -100,6 +130,11 @@ class TestProfileWriter : public ProfileWriter {
 // CHECK-NEXT: 2 counters and 2 callsites
 // CHECK-NEXT:   Counter values: 2 1
 // CHECK-NEXT: Exited Context Section
+// CHECK-NEXT: Entered Flat Section
+// CHECK-NEXT: Flat: 6759619411192316602 3,1
+// This is flatFct (guid: 14569438697463215220)
+// CHECK-NEXT: Flat: 14569438697463215220 1,2
+// CHECK-NEXT: Exited Flat Section
 
 bool profileWriter() {
   TestProfileWriter W;
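
A note on how the two expected `Flat:` lines line up with the code, under my reading that counter slot 0 is the entry count (`onContextEnter` bumps `counters()[0]`) and later slots belong to conditionally executed regions - the slot-to-branch mapping is an assumption, not something this patch spells out:

```c++
// flatFct, GUID 14569438697463215220 -> "1,2": entered once (slot 0 == 1),
// loop body executed twice (slot 1 == 2).
__attribute__((noinline)) void flatFct() {
  printf("flat check 1\n");
  someFunction(1);
#pragma nounroll
  for (auto I = 0; I < 2; ++I) {
    someFunction(I);
  }
}
// someFunction, GUID 6759619411192316602 -> "3,1": called three times outside
// the root (once directly, twice from the loop), with its conditional region
// running once across those calls. Its calls made under theRoot don't show up
// here - by design they are accounted in the contextual section instead.
```
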
@@ -107,7 +142,9 @@ bool profileWriter() {
 }
 int main(int argc, char **argv) {
+  __llvm_ctx_profile_start_collection();
   theRoot();
+  flatFct();
   // This would be implemented in a specific RPC handler, but here we just call
   // it directly.
   return !profileWriter();
 }
diff --git a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
index fe8ddcdf79129..0fc4883305145 100644
--- a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
+++ b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h
@@ -122,6 +122,12 @@ class ProfileWriter {
   virtual void startContextSection() = 0;
   virtual void writeContextual(const ctx_profile::ContextNode &RootNode) = 0;
   virtual void endContextSection() = 0;
+
+  virtual void startFlatSection() = 0;
+  virtual void writeFlat(ctx_profile::GUID Guid, const uint64_t *Buffer,
+                         size_t BufferSize) = 0;
+  virtual void endFlatSection() = 0;
+
   virtual ~ProfileWriter() = default;
 };
 } // namespace ctx_profile
diff --git a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
index 40f355f99eb53..c5a724d9a2142 100644
--- a/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
+++ b/llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
@@ -87,10 +87,10 @@ class PGOCtxProfileWriter final : public ctx_profile::ProfileWriter {
   void writeContextual(const ctx_profile::ContextNode &RootNode) override;
   void endContextSection() override;
 
-  void startFlatSection();
+  void startFlatSection() override;
   void writeFlat(ctx_profile::GUID Guid, const uint64_t *Buffer,
-                 size_t BufferSize);
-  void endFlatSection();
+                 size_t BufferSize) override;
+  void endFlatSection() override;
 
   // constants used in writing which a reader may find useful.
   static constexpr unsigned CodeLen = 2;
diff --git a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
index aa6bee23ad5ff..ffc2aec77ff91 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOCtxProfLowering.cpp
@@ -12,6 +12,7 @@
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/IR/Analysis.h"
 #include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -53,6 +54,7 @@ class CtxInstrumentationLowerer final {
   ModuleAnalysisManager &MAM;
   Type *ContextNodeTy = nullptr;
   Type *ContextRootTy = nullptr;
+  Type *FunctionDataTy = nullptr;
 
   DenseMap<const Function *, Constant *> ContextRootMap;
   Function *StartCtx = nullptr;
@@ -120,6 +122,13 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M,
                       PointerTy,          /*CurrentMem*/
                       SanitizerMutexType, /*Taken*/
                   });
+  FunctionDataTy =
+      StructType::get(M.getContext(), {
+                          PointerTy,          /*Next*/
+                          PointerTy,          /*FlatCtx*/
+                          SanitizerMutexType, /*Mutex*/
+                      });
+
   // The Context header.
   ContextNodeTy = StructType::get(M.getContext(), {
                                       I64Ty, /*Guid*/
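
Since the runtime will treat the compiler-emitted global as a `FunctionData`, `FunctionDataTy` must mirror the C++ struct field-for-field - `{ ptr, ptr, i8 }` for `Next`, `FlatCtx`, and the spin-mutex byte (the patch as originally posted emitted only `{ ptr, i8 }`, which the runtime's three-field struct would overrun). A runtime-side compile-time guard one could add to catch such drift - a sketch, not part of the patch, and it assumes a 64-bit target:

```c++
#include <cstddef>

#include "CtxInstrProfiling.h"

static_assert(sizeof(void *) == 8, "offsets below assume a 64-bit target");
static_assert(offsetof(__ctx_profile::FunctionData, Next) == 0,
              "IR field 0: ptr");
static_assert(offsetof(__ctx_profile::FunctionData, FlatCtx) == 8,
              "IR field 1: ptr");
static_assert(offsetof(__ctx_profile::FunctionData, Mutex) == 16,
              "IR field 2: i8");
```
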
@@ -163,7 +171,8 @@ CtxInstrumentationLowerer::CtxInstrumentationLowerer(Module &M,
   GetCtx = cast<Function>(
       M.getOrInsertFunction(CompilerRtAPINames::GetCtx,
                             FunctionType::get(PointerTy,
-                                              {PointerTy, /*Callee*/
+                                              {PointerTy, /*FunctionData*/
+                                               PointerTy, /*Callee*/
                                                I64Ty,     /*Guid*/
                                                I32Ty,     /*NumCounters*/
                                                I32Ty},    /*NumCallsites*/
@@ -224,7 +233,6 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
   assert(Mark->getIndex()->isZero());
 
   IRBuilder<> Builder(Mark);
-
   Guid = Builder.getInt64(
       AssignGUIDPass::getGUID(cast<Function>(*Mark->getNameValue())));
   // The type of the context of this function is now knowable since we have
@@ -248,9 +256,14 @@ bool CtxInstrumentationLowerer::lowerFunction(Function &F) {
       ORE.emit(
           [&] { return OptimizationRemark(DEBUG_TYPE, "Entrypoint", &F); });
     } else {
-      Context =
-          Builder.CreateCall(GetCtx, {&F, Guid, Builder.getInt32(NumCounters),
-                                      Builder.getInt32(NumCallsites)});
+      // Do not give the global a name: per-function names would end up taking
+      // a lot of space in the binary; unnamed globals get compact numeric IDs.
+      auto *FData = new GlobalVariable(
+          M, FunctionDataTy, false, GlobalVariable::InternalLinkage,
+          Constant::getNullValue(FunctionDataTy));
+      Context = Builder.CreateCall(GetCtx, {FData, &F, Guid,
+                                            Builder.getInt32(NumCounters),
+                                            Builder.getInt32(NumCallsites)});
       ORE.emit([&] {
         return OptimizationRemark(DEBUG_TYPE, "RegularFunction", &F);
       });
diff --git a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
index 1927060de868e..e4a5ebdc818e6 100644
--- a/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
+++ b/llvm/test/Transforms/PGOProfile/ctx-instrumentation.ll
@@ -13,6 +13,11 @@ declare void @bar()
 ; LOWERING: @another_entrypoint_no_callees_ctx_root = global { ptr, ptr, ptr, i8 } zeroinitializer
 ; LOWERING: @__llvm_ctx_profile_callsite = external hidden thread_local global ptr
 ; LOWERING: @__llvm_ctx_profile_expected_callee = external hidden thread_local global ptr
+; LOWERING: @[[GLOB0:[0-9]+]] = internal global { ptr, ptr, i8 } zeroinitializer
+; LOWERING: @[[GLOB1:[0-9]+]] = internal global { ptr, ptr, i8 } zeroinitializer
+; LOWERING: @[[GLOB2:[0-9]+]] = internal global { ptr, ptr, i8 } zeroinitializer
+; LOWERING: @[[GLOB3:[0-9]+]] = internal global { ptr, ptr, i8 } zeroinitializer
+; LOWERING: @[[GLOB4:[0-9]+]] = internal global { ptr, ptr, i8 } zeroinitializer
 ;.
 define void @foo(i32 %a, ptr %fct) {
 ; INSTRUMENT-LABEL: define void @foo(
@@ -34,7 +39,7 @@ define void @foo(i32 %a, ptr %fct) {
 ;
 ; LOWERING-LABEL: define void @foo(
 ; LOWERING-SAME: i32 [[A:%.*]], ptr [[FCT:%.*]]) !guid [[META0:![0-9]+]] {
-; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @foo, i64 6699318081062747564, i32 2, i32 2)
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB0]], ptr @foo, i64 6699318081062747564, i32 2, i32 2)
 ; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
 ; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], 1
 ; LOWERING-NEXT:    [[TMP4:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_expected_callee)
@@ -176,7 +181,7 @@ define void @simple(i32 %a) {
 ;
 ; LOWERING-LABEL: define void @simple(
 ; LOWERING-SAME: i32 [[A:%.*]]) !guid [[META3:![0-9]+]] {
-; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @simple, i64 -3006003237940970099, i32 1, i32 0)
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB1]], ptr @simple, i64 -3006003237940970099, i32 1, i32 0)
 ; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
 ; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], -2
 ; LOWERING-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
@@ -200,7 +205,7 @@ define i32 @no_callsites(i32 %a) {
 ;
 ; LOWERING-LABEL: define i32 @no_callsites(
 ; LOWERING-SAME: i32 [[A:%.*]]) !guid [[META4:![0-9]+]] {
-; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @no_callsites, i64 5679753335911435902, i32 2, i32 0)
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB2]], ptr @no_callsites, i64 5679753335911435902, i32 2, i32 0)
 ; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
 ; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], -2
 ; LOWERING-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr
@@ -232,7 +237,7 @@ define void @no_counters() {
 ;
 ; LOWERING-LABEL: define void @no_counters(
 ; LOWERING-SAME: ) !guid [[META5:![0-9]+]] {
-; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @no_counters, i64 5458232184388660970, i32 1, i32 1)
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB3]], ptr @no_counters, i64 5458232184388660970, i32 1, i32 1)
 ; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
 ; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], 1
 ; LOWERING-NEXT:    [[TMP4:%.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__llvm_ctx_profile_expected_callee)
@@ -260,7 +265,7 @@ define void @inlineasm() {
 ;
 ; LOWERING-LABEL: define void @inlineasm(
 ; LOWERING-SAME: ) !guid [[META6:![0-9]+]] {
-; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @inlineasm, i64 -3771893999295659109, i32 1, i32 0)
+; LOWERING-NEXT:    [[TMP1:%.*]] = call ptr @__llvm_ctx_profile_get_context(ptr @[[GLOB4]], ptr @inlineasm, i64 -3771893999295659109, i32 1, i32 0)
 ; LOWERING-NEXT:    [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64
 ; LOWERING-NEXT:    [[TMP3:%.*]] = and i64 [[TMP2]], -2
 ; LOWERING-NEXT:    [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr