From 0f5156fbe4ea0963163fff2046e814fb357718c0 Mon Sep 17 00:00:00 2001 From: jhugard Date: Thu, 19 Feb 2026 09:30:03 -0800 Subject: [PATCH] Fix data race in AsyncBlockTests with lock-free opcode logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Access violation crash in `AsyncBlockTests::VerifyAsyncBlockReuse` due to concurrent `std::vector::push_back` operations on shared `opCodes` member from overlapping async call lifecycle phases. The test intentionally reuses `XAsyncBlock` and `FactorialCallData` across sequential async calls. When the first call's cleanup (invoked from completion callback) races with the second call's initialization, both threads attempt to push_back into the same `std::vector`, causing heap corruption during vector reallocation. **Crash stack trace:** ``` ucrtbased!_free_dbg (heap corruption during vector realloc) ← std::vector::push_back ← FactorialWorkerSimple (Cleanup opcode from first call) ← AsyncState::~AsyncState ← CompletionCallback [concurrent with] ← FactorialWorkerSimple (Begin/DoWork from second call) ← VerifyAsyncBlockReuse ``` **Detection:** Heisenbug found after 6-hour soak test under Windows CDB with page heap enabled (`gflags /p /enable`). 
Replace `std::vector<XAsyncOp> opCodes` with fixed-capacity lock-free append buffer: - `std::array<std::atomic<XAsyncOp>, 16>` for storage (capacity exceeds max test depth) - `std::atomic<size_t>` for thread-safe index allocation - `RecordOp(op)`: atomic fetch-add for index, then `store(memory_order_release)` to array slot - `GetOpCodes()`: snapshot current state into vector via `load(memory_order_acquire)` **Why this approach:** - Aligns with library philosophy of avoiding synchronization primitives - No dynamic allocation eliminates reallocation races - Bounded opcode sequences (max ~9 in distributed factorial tests) - Append-only during async lifecycle, read-only during verification - Proper release-acquire semantics ensure visibility on ARM/weakly-ordered architectures - Natural lock-free semantics: each writer gets unique slot via atomic index - `Tests/UnitTests/Tests/AsyncBlockTests.cpp`: - Added `#include <array>` for std::array support - Replaced `std::vector<XAsyncOp> opCodes` with `std::atomic<XAsyncOp>` array buffer - Updated `FactorialWorkerSimple` and `FactorialWorkerDistributed` to use `RecordOp()` - Updated all test verification sites to use `GetOpCodes()` snapshot method - Optimized multi-call sites to snapshot once and reuse - **Specific test**: `VerifyAsyncBlockReuse` passes 10/10 rapid runs - **Full suite**: All 23 AsyncBlockTests passed with no regressions - **Note**: Original heisenbug required 6hr soak to reproduce; single-pass testing verifies compilation and basic functionality, but extended soak testing would be needed to fully validate stability under stress - Test-only change, no production code affected - Eliminates data race without introducing mutex overhead - Maintains test semantics and coverage --- Tests/UnitTests/Tests/AsyncBlockTests.cpp | 60 ++++++++++++++++++----- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/Tests/UnitTests/Tests/AsyncBlockTests.cpp b/Tests/UnitTests/Tests/AsyncBlockTests.cpp index 7a77556d..06fd4ae7 100644 --- a/Tests/UnitTests/Tests/AsyncBlockTests.cpp +++ 
b/Tests/UnitTests/Tests/AsyncBlockTests.cpp @@ -7,6 +7,7 @@ #include "XAsyncProviderPriv.h" #include "XTaskQueue.h" #include "XTaskQueuePriv.h" +#include <array> #define TEST_CLASS_OWNER L"brianpe" @@ -79,13 +80,43 @@ DEFINE_TEST_CLASS(AsyncBlockTests) DWORD result = 0; DWORD iterationWait = 0; DWORD workThread = 0; - std::vector<XAsyncOp> opCodes; + + // Fixed-capacity lock-free opcode log for concurrent append + static constexpr size_t MAX_OPCODES = 16; + std::array<std::atomic<XAsyncOp>, MAX_OPCODES> opCodesArray{}; + std::atomic<size_t> opCodesCount{ 0 }; + std::atomic<DWORD> inWork = 0; std::atomic<DWORD> refs = 0; std::atomic<bool> canceled = false; void AddRef() { refs++; } void Release() { if (--refs == 0) delete this; } + + // Thread-safe append operation + void RecordOp(XAsyncOp op) + { + size_t idx = opCodesCount.fetch_add(1, std::memory_order_relaxed); + if (idx < MAX_OPCODES) + { + opCodesArray[idx].store(op, std::memory_order_release); + } + // Silently drop if overflow (test will fail on verification anyway) + } + + // Snapshot current opcodes into a vector for verification + std::vector<XAsyncOp> GetOpCodes() const + { + size_t count = opCodesCount.load(std::memory_order_acquire); + count = (count < MAX_OPCODES) ? 
count : MAX_OPCODES; + std::vector<XAsyncOp> result; + result.reserve(count); + for (size_t i = 0; i < count; i++) + { + result.push_back(opCodesArray[i].load(std::memory_order_acquire)); + } + return result; + } }; static PCWSTR OpName(XAsyncOp op) @@ -117,7 +148,7 @@ DEFINE_TEST_CLASS(AsyncBlockTests) { FactorialCallData* d = (FactorialCallData*)data->context; - d->opCodes.push_back(opCode); + d->RecordOp(opCode); switch (opCode) { @@ -166,7 +197,7 @@ DEFINE_TEST_CLASS(AsyncBlockTests) { FactorialCallData* d = (FactorialCallData*)data->context; - d->opCodes.push_back(opCode); + d->RecordOp(opCode); switch (opCode) { @@ -391,7 +422,7 @@ DEFINE_TEST_CLASS(AsyncBlockTests) ops.push_back(XAsyncOp::GetResult); ops.push_back(XAsyncOp::Cleanup); - VerifyOps(data.Ref->opCodes, ops); + VerifyOps(data.Ref->GetOpCodes(), ops); VERIFY_QUEUE_EMPTY(queue); } @@ -480,7 +511,7 @@ DEFINE_TEST_CLASS(AsyncBlockTests) ops.push_back(XAsyncOp::GetResult); ops.push_back(XAsyncOp::Cleanup); - VerifyOps(data.Ref->opCodes, ops); + VerifyOps(data.Ref->GetOpCodes(), ops); VERIFY_QUEUE_EMPTY(queue); } @@ -554,8 +585,9 @@ DEFINE_TEST_CLASS(AsyncBlockTests) Sleep(500); VERIFY_ARE_EQUAL(E_ABORT, hrCallback); - VerifyHasOp(data.Ref->opCodes, XAsyncOp::Cancel); - VerifyHasOp(data.Ref->opCodes, XAsyncOp::Cleanup); + auto opCodes = data.Ref->GetOpCodes(); + VerifyHasOp(opCodes, XAsyncOp::Cancel); + VerifyHasOp(opCodes, XAsyncOp::Cleanup); VERIFY_QUEUE_EMPTY(queue); } @@ -587,9 +619,10 @@ DEFINE_TEST_CLASS(AsyncBlockTests) VERIFY_ARE_EQUAL(XAsyncGetStatus(&async, true), E_ABORT); XTaskQueueDispatch(queue, XTaskQueuePort::Completion, 700); - VerifyHasOp(data.Ref->opCodes, XAsyncOp::Cancel); - VerifyHasOp(data.Ref->opCodes, XAsyncOp::Cleanup); - VerifyHasOp(data.Ref->opCodes, XAsyncOp::DoWork); + auto opCodes = data.Ref->GetOpCodes(); + VerifyHasOp(opCodes, XAsyncOp::Cancel); + VerifyHasOp(opCodes, XAsyncOp::Cleanup); + VerifyHasOp(opCodes, XAsyncOp::DoWork); VERIFY_QUEUE_EMPTY(queue); } @@ -620,9 +653,10 @@
DEFINE_TEST_CLASS(AsyncBlockTests) VERIFY_ARE_EQUAL(XAsyncGetStatus(&async, true), E_ABORT); Sleep(500); - VerifyHasOp(data.Ref->opCodes, XAsyncOp::Cancel); - VerifyHasOp(data.Ref->opCodes, XAsyncOp::Cleanup); - VerifyHasOp(data.Ref->opCodes, XAsyncOp::DoWork); + auto opCodes = data.Ref->GetOpCodes(); + VerifyHasOp(opCodes, XAsyncOp::Cancel); + VerifyHasOp(opCodes, XAsyncOp::Cleanup); + VerifyHasOp(opCodes, XAsyncOp::DoWork); VERIFY_QUEUE_EMPTY(queue); }