Commit

* Fix timeouts correctness bug wherein we would blow up the timeout on spurious wake by having the header compare again.

* Remove double indirect call trampoline for the non-lock free case.
BillyONeal committed Aug 1, 2020
1 parent 45b0923 commit 0968ed2
Showing 3 changed files with 108 additions and 90 deletions.
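
To make the bug class in the commit title concrete, here is a minimal, self-contained C++ sketch of the general pattern, not the STL's code: the waiter compares again after every wake and waits against an absolute deadline, so a spurious wake neither ends the wait early nor restarts the timeout. All names here (wait_for_change, the std::condition_variable-based machinery) are hypothetical illustration only.

#include <chrono>
#include <condition_variable>
#include <cstring>
#include <mutex>

// Hypothetical illustration: re-check the expected value after every wake,
// and wait against an absolute deadline so a spurious wake neither ends the
// wait early nor restarts the timeout.
template <class T>
bool wait_for_change(const T& storage, const T& expected, std::mutex& mtx, std::condition_variable& cv,
    std::chrono::milliseconds timeout) {
    const auto deadline = std::chrono::steady_clock::now() + timeout;
    std::unique_lock<std::mutex> lock(mtx);
    for (;;) {
        if (std::memcmp(&storage, &expected, sizeof(T)) != 0) {
            return true; // the value really changed
        }

        if (cv.wait_until(lock, deadline) == std::cv_status::timeout) {
            return false; // deadline reached without a change
        }
        // Otherwise this may have been a spurious wake; loop and compare again.
    }
}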
114 changes: 72 additions & 42 deletions stl/inc/atomic
@@ -17,7 +17,9 @@
#include <stdint.h>
#include <string.h>
#include <xatomic.h>
#if _HAS_CXX20
#include <xatomic_wait.h>
#endif // _HAS_CXX20

#pragma pack(push, _CRT_PACKING)
#pragma warning(push, _STL_WARNING_LEVEL)
@@ -360,6 +362,7 @@ template <class _Ty, size_t = _Atomic_storage_traits<_Ty>::_Storage_size>
#endif // TRANSITION, ABI
struct _Atomic_storage;

#if _HAS_CXX20
template <class _Ty, class _Value_type>
void _Atomic_wait_direct(
const _Atomic_storage<_Ty>* const _This, _Value_type _Expected_bytes, const memory_order _Order) noexcept {
@@ -385,20 +388,50 @@ void _Atomic_wait_direct(
__std_atomic_wait_direct(_Storage_ptr, &_Expected_bytes, sizeof(_Value_type), _Atomic_wait_no_timeout);
}
}
#endif // _HAS_CXX20

template <class _Predicate>
bool __stdcall _Atomic_wait_indirect_callback(const void* const _Param, void* const _Comparand) noexcept {
const auto _Fn = static_cast<const _Predicate*>(_Param);
return (*_Fn)(_Comparand);
#if 1 // TRANSITION, ABI
inline void _Atomic_lock_spinlock(long& _Spinlock) noexcept {
while (_InterlockedExchange(&_Spinlock, 1)) {
_YIELD_PROCESSOR();
}
}

template <class _Predicate>
void _Atomic_wait_indirect(const void* const _Storage, const _Predicate& _Are_equal, void* const _Comparand,
unsigned long _Remaining_timeout) noexcept {
__std_atomic_wait_indirect(
_Storage, _Atomic_wait_indirect_callback<_Predicate>, &_Are_equal, _Comparand, _Remaining_timeout);
inline void _Atomic_unlock_spinlock(long& _Spinlock) noexcept {
#if defined(_M_ARM) || defined(_M_ARM64)
_Memory_barrier();
__iso_volatile_store32(reinterpret_cast<int*>(&_Spinlock), 0);
_Memory_barrier();
#else // ^^^ ARM32/ARM64 hardware / x86/x64 hardware vvv
_InterlockedExchange(&_Spinlock, 0);
#endif // hardware
}

struct _Spinlock_guard {
long& _Spinlock;
_Spinlock_guard(long& _Spinlock_) noexcept : _Spinlock(_Spinlock_) {
_Atomic_lock_spinlock(_Spinlock);
};
~_Spinlock_guard() {
_Atomic_unlock_spinlock(_Spinlock);
}

_Spinlock_guard(const _Spinlock_guard&) = delete;
_Spinlock_guard& operator=(const _Spinlock_guard&) = delete;
};

#if _HAS_CXX20
inline bool __stdcall _Atomic_wait_compare_non_lock_free(

AlexGuteniev (Contributor) commented on Aug 1, 2020:
Make size a template parameter so that the compiler can emit faster memcmp
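
For illustration, a hypothetical version of the comparison callback with the size as a template parameter, sketching the reviewer's suggestion rather than what the commit ships (the author declines it below for code-size reasons). It assumes the _Atomic_lock_spinlock/_Atomic_unlock_spinlock helpers from this diff are visible; the function name is made up.

template <size_t _Size>
bool __stdcall _Atomic_wait_compare_fixed_size( // hypothetical name
    const void* _Storage, void* _Comparand, size_t, void* _Spinlock_raw) noexcept {
    // With _Size known at compile time, the compiler can usually fold this
    // memcmp into a fixed-width load-and-compare instead of a library call.
    long& _Spinlock = *static_cast<long*>(_Spinlock_raw);
    _Atomic_lock_spinlock(_Spinlock);
    const int _Cmp_result = _CSTD memcmp(_Storage, _Comparand, _Size);
    _Atomic_unlock_spinlock(_Spinlock);
    return _Cmp_result == 0;
}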

AlexGuteniev (Contributor) commented on Aug 1, 2020:
I removed const from the comparand so it could be altered for padding bits. Now that no padding bits are handled here, put the const back.

AlexGuteniev (Contributor) commented on Aug 1, 2020:
Note that the trampoline was not double indirect for release builds. Lambdas are inlined even with /Ob1.

BillyONeal (Author, Member) commented on Aug 1, 2020:
I'm explicitly trying not to do that. This is only engaged for the non-lock-free case, so T isn't one of the sizes that are "fun" for the compiler to generate that stuff for, and it increases code size. We can always pay more code size later if profiling indicates it's worthwhile.

const void* _Storage, void* _Comparand, size_t _Size, void* _Spinlock_raw) noexcept {
long& _Spinlock = *static_cast<long*>(_Spinlock_raw);
_Atomic_lock_spinlock(_Spinlock);
const auto _Cmp_result = _CSTD memcmp(_Storage, _Comparand, _Size);
_Atomic_unlock_spinlock(_Spinlock);
return _Cmp_result == 0;
}
#endif // _HAS_CXX20
#endif // TRANSITION, ABI

template <class _Ty, size_t /* = ... */>
struct _Atomic_storage {
// Provides operations common to all specializations of std::atomic, load, store, exchange, and CAS.
@@ -472,34 +505,39 @@ struct _Atomic_storage {

#if _HAS_CXX20
void wait(_Ty _Expected, const memory_order _Order = memory_order_seq_cst) const noexcept {
(void) _Order; // non-lock-free operations are always seq_cst
const auto _Storage_ptr = _STD addressof(_Storage);
const auto _Expected_ptr = _STD addressof(_Expected);

auto _Are_equal = [this, _Order](void* const _Comparand) {
const _Ty _Observed = load(_Order);
const auto _Observed_ptr = _STD addressof(_Observed);
if (_CSTD memcmp(_Observed_ptr, _Comparand, sizeof(_Ty)) == 0) {
return true;
}
for (;;) {
{
_Spinlock_guard _Lock{_Spinlock};
if (_CSTD memcmp(_Storage_ptr, _Expected_ptr, sizeof(_Ty)) != 0) {
// contents differed, we might be done, check for padding
#if _CMPXCHG_MASK_OUT_PADDING_BITS
if constexpr (_Might_have_non_value_bits<_Ty>) {
_Storage_for<_Ty> _Local;
const auto _Local_ptr = _Local._Ptr();
_CSTD memcpy(_Local_ptr, _Observed_ptr, sizeof(_Ty));
__builtin_zero_non_value_bits(_Local_ptr);
__builtin_zero_non_value_bits(reinterpret_cast<_Ty*>(_Comparand));

if (_CSTD memcmp(_Local_ptr, _Comparand, sizeof(_Ty)) == 0) {
_CSTD memcpy(_Comparand, _Observed_ptr, sizeof(_Ty));
return true;
if constexpr (_Might_have_non_value_bits<_Ty>) {
_Storage_for<_Ty> _Local;
const auto _Local_ptr = _Local._Ptr();
_CSTD memcpy(_Local_ptr, _Storage_ptr, sizeof(_Ty));
__builtin_zero_non_value_bits(_Local_ptr);
__builtin_zero_non_value_bits(_Expected_ptr);
if (_CSTD memcmp(_Local_ptr, _Expected_ptr, sizeof(_Ty)) == 0) {
// _Storage differs from _Expected only by padding; copy the padding from _Storage into
// _Expected
_CSTD memcpy(_Expected_ptr, _Storage_ptr, sizeof(_Ty));
} else {
// truly different, we're done
return;
}
} else
#endif // #if _CMPXCHG_MASK_OUT_PADDING_BITS
{
return;
}
}
}
#endif // _CMPXCHG_MASK_OUT_PADDING_BITS
return false;
};
} // unlock

if (_Are_equal(_Expected_ptr)) {
_Atomic_wait_indirect(_Storage_ptr, _Are_equal, _Expected_ptr, _Atomic_wait_no_timeout);
__std_atomic_wait_indirect(_Storage_ptr, _Expected_ptr, sizeof(_Ty), &_Spinlock,
&_Atomic_wait_compare_non_lock_free, _Atomic_wait_no_timeout);
}
}

@@ -514,19 +552,11 @@ struct _Atomic_storage {

#if 1 // TRANSITION, ABI
void _Lock() const noexcept { // lock the spinlock
while (_InterlockedExchange(&_Spinlock, 1)) {
_YIELD_PROCESSOR();
}
_Atomic_lock_spinlock(_Spinlock);
}

void _Unlock() const noexcept { // unlock the spinlock
#if defined(_M_ARM) || defined(_M_ARM64)
_Memory_barrier();
__iso_volatile_store32(reinterpret_cast<int*>(&_Spinlock), 0);
_Memory_barrier();
#else // ^^^ ARM32/ARM64 hardware / x86/x64 hardware vvv
_InterlockedExchange(&_Spinlock, 0);
#endif // hardware
_Atomic_unlock_spinlock(_Spinlock);
}

private:
7 changes: 4 additions & 3 deletions stl/inc/xatomic_wait.h
@@ -48,10 +48,11 @@ void __stdcall __std_atomic_notify_all_direct(const void* _Storage) noexcept;
// The "indirect" functions are used when the size is not 1, 2, 4, or 8; these notionally wait on another value which is
// of one of those sizes whose value changes upon notify, hence "indirect". (As of 2020-07-24, this always uses the
// fallback SRWLOCK and CONDITION_VARIABLE implementation but that is not contractual.)
using _Atomic_wait_indirect_callback_t = bool(__stdcall*)(const void* _Parameter, void* _Comparand);
using _Atomic_wait_indirect_equal_callback_t = bool(__stdcall*)(
const void* _Storage, void* _Comparand, size_t _Size, void* _Param) noexcept;

int __stdcall __std_atomic_wait_indirect(const void* _Storage, _Atomic_wait_indirect_callback_t _Are_equal,
const void* _Parameter, void* _Comparand, unsigned long _Remaining_timeout) noexcept;
int __stdcall __std_atomic_wait_indirect(const void* _Storage, void* _Comparand, size_t _Size, void* _Param,
_Atomic_wait_indirect_equal_callback_t _Are_equal, unsigned long _Remaining_timeout) noexcept;
void __stdcall __std_atomic_notify_one_indirect(const void* _Storage) noexcept;
void __stdcall __std_atomic_notify_all_indirect(const void* _Storage) noexcept;
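
The header comment above says the indirect path currently rides on the fallback SRWLOCK and CONDITION_VARIABLE machinery. As a rough orientation, here is a simplified sketch of how such an indirect wait table can work. It is an assumption-laden illustration (fixed-size table, one condition variable per bucket, made-up _Sketch_* names, _Atomic_wait_indirect_equal_callback_t from this header assumed visible), not the shipped implementation, which keeps a per-waiter context with its own condition variable on a list per bucket.

#include <Windows.h>
#include <stdint.h>

// Hypothetical sketch: each storage address hashes to a bucket holding an
// SRWLOCK and a CONDITION_VARIABLE; waiters compare under the lock, then sleep.
struct _Sketch_wait_entry {
    SRWLOCK _Lock               = SRWLOCK_INIT;
    CONDITION_VARIABLE _Condvar = CONDITION_VARIABLE_INIT;
};

inline _Sketch_wait_entry& _Sketch_table_entry(const void* const _Storage) noexcept {
    static _Sketch_wait_entry _Table[256];
    return _Table[(reinterpret_cast<uintptr_t>(_Storage) >> 4) % 256];
}

inline int __stdcall _Sketch_wait_indirect(const void* const _Storage, void* const _Comparand, const size_t _Size,
    void* const _Param, const _Atomic_wait_indirect_equal_callback_t _Are_equal,
    const unsigned long _Remaining_timeout) noexcept {
    auto& _Entry = _Sketch_table_entry(_Storage);
    AcquireSRWLockExclusive(&_Entry._Lock);
    int _Result = TRUE;
    // Comparing under the lock closes the race with a store followed by a notify.
    if (_Are_equal(_Storage, _Comparand, _Size, _Param)) {
        if (!SleepConditionVariableSRW(&_Entry._Condvar, &_Entry._Lock, _Remaining_timeout, 0)) {
            _Result = FALSE; // treat failure as a timeout
        }
    }
    ReleaseSRWLockExclusive(&_Entry._Lock);
    return _Result;
}

inline void __stdcall _Sketch_notify_all_indirect(const void* const _Storage) noexcept {
    WakeAllConditionVariable(&_Sketch_table_entry(_Storage)._Condvar);
}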

77 changes: 32 additions & 45 deletions stl/src/atomic_wait.cpp
@@ -173,25 +173,25 @@ namespace {
_Wake_by_address_all(Address);
}

bool __stdcall _Atomic_wait_are_equal_8_relaxed(const void* const _Storage, void* const _Comparand) noexcept {
return __iso_volatile_load8(const_cast<const volatile char*>(reinterpret_cast<const char*>(_Storage)))
== *reinterpret_cast<const char*>(_Comparand);
}

bool __stdcall _Atomic_wait_are_equal_16_relaxed(const void* const _Storage, void* const _Comparand) noexcept {
return __iso_volatile_load16(const_cast<const volatile short*>(reinterpret_cast<const short*>(_Storage)))
== *reinterpret_cast<const short*>(_Comparand);
}

bool __stdcall _Atomic_wait_are_equal_32_relaxed(const void* const _Storage, void* const _Comparand) noexcept {
return __iso_volatile_load32(const_cast<const volatile int*>(reinterpret_cast<const int*>(_Storage)))
== *reinterpret_cast<const int*>(_Comparand);
}

bool __stdcall _Atomic_wait_are_equal_64_relaxed(const void* const _Storage, void* const _Comparand) noexcept {
return __iso_volatile_load64(
const_cast<const volatile long long*>(reinterpret_cast<const long long*>(_Storage)))
== *reinterpret_cast<const long long*>(_Comparand);
bool __stdcall _Atomic_wait_are_equal_direct_fallback(
const void* _Storage, void* _Comparand, size_t _Size, void* _Param) noexcept {
(void) _Param;
switch (_Size) {
case 1:
return __iso_volatile_load8(reinterpret_cast<const char*>(_Storage))
== *reinterpret_cast<const char*>(_Comparand);
case 2:
return __iso_volatile_load16(reinterpret_cast<const short*>(_Storage))
== *reinterpret_cast<const short*>(_Comparand);
case 4:
return __iso_volatile_load32(reinterpret_cast<const int*>(_Storage))
== *reinterpret_cast<const int*>(_Comparand);
case 8:
return __iso_volatile_load64(reinterpret_cast<const long long*>(_Storage))
== *reinterpret_cast<const long long*>(_Comparand);
default:
_CSTD abort();
}
}
#endif // _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE
} // unnamed namespace
@@ -202,22 +202,8 @@ int __stdcall __std_atomic_wait_direct(const void* const _Storage, void* const _
const unsigned long _Remaining_timeout) noexcept {
#if _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE == 0
if (_Acquire_wait_functions() < __std_atomic_api_level::__has_wait_on_address) {
switch (_Size) {
case 1:
return __std_atomic_wait_indirect(
_Storage, &_Atomic_wait_are_equal_8_relaxed, _Storage, _Comparand, _Remaining_timeout);
case 2:
return __std_atomic_wait_indirect(
_Storage, &_Atomic_wait_are_equal_16_relaxed, _Storage, _Comparand, _Remaining_timeout);
case 4:
return __std_atomic_wait_indirect(
_Storage, &_Atomic_wait_are_equal_32_relaxed, _Storage, _Comparand, _Remaining_timeout);
case 8:
return __std_atomic_wait_indirect(
_Storage, &_Atomic_wait_are_equal_64_relaxed, _Storage, _Comparand, _Remaining_timeout);
default:
_CSTD abort();
}
return __std_atomic_wait_indirect(
_Storage, _Comparand, _Size, nullptr, &_Atomic_wait_are_equal_direct_fallback, _Remaining_timeout);
}
#endif // _ATOMIC_WAIT_ON_ADDRESS_STATICALLY_AVAILABLE == 0

@@ -277,23 +263,24 @@ void __stdcall __std_atomic_notify_all_indirect(const void* const _Storage) noex
}
}

int __stdcall __std_atomic_wait_indirect(const void* _Storage, _Atomic_wait_indirect_callback_t _Are_equal,
const void* _Parameter, void* _Comparand, unsigned long _Remaining_timeout) noexcept {
int __stdcall __std_atomic_wait_indirect(const void* const _Storage, void* const _Comparand, const size_t _Size,
void* const _Param, const _Atomic_wait_indirect_equal_callback_t _Are_equal,
const unsigned long _Remaining_timeout) noexcept {
auto& _Entry = _Atomic_wait_table_entry(_Storage);

_SrwLock_guard _Guard(_Entry._Lock);
_Guarded_wait_context _Context{_Storage, &_Entry._Wait_list_head};

for (;;) {
if (!_Are_equal(_Parameter, _Comparand)) {
return TRUE;
}
if (!_Are_equal(_Storage, _Comparand, _Size, _Param)) { // note: under lock
return TRUE;
}

if (!SleepConditionVariableSRW(&_Context._Condition, &_Entry._Lock, _Remaining_timeout, 0)) {
_Assume_timeout();
return FALSE;
}
if (!SleepConditionVariableSRW(&_Context._Condition, &_Entry._Lock, _Remaining_timeout, 0)) {
_Assume_timeout();
return FALSE;
}

return TRUE;
}

unsigned long long __stdcall __std_atomic_wait_get_deadline(const unsigned long long _Timeout) noexcept {
