Skip to content

Commit 1a5af34

Browse files
ldionne and dvyukov
authored
[libc++] Speed up classic locale (take 2) (#73533)
Locale objects use atomic reference counting, which may be very expensive in parallel applications. The classic locale is used by default by all streams and can be heavily contended. But it is never destroyed, so reference counting is completely pointless for the classic locale. Currently, ~70% of the time in the parallel stringstream benchmarks is spent in the locale ctor/dtor, and execution slows down radically as more threads are added. Avoid reference counting on the classic locale. With this change, the parallel benchmarks start to scale with the number of threads. This is a re-application of f8afc53 (aka PR #72112), which was reverted in 4e0c48b because it broke the sanitizer builds due to an initialization order fiasco. This issue has now been fixed by ensuring that the locale is constinit'ed. Co-authored-by: Dmitry Vyukov <dvyukov@google.com>
1 parent fa712b0 commit 1a5af34

File tree

6 files changed

+172
-55
lines changed

6 files changed

+172
-55
lines changed
Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
#include "benchmark/benchmark.h"
22
#include "test_macros.h"
33

4+
#include <mutex>
45
#include <sstream>
56

67
// Forward declaration that carries the no-inline attribute. Its parameter list must match
// the definition; a declaration without parameters would introduce a separate overload that
// is never defined, and the attribute would not apply to the function being benchmarked.
TEST_NOINLINE double istream_numbers(std::locale* loc);
78

8-
double istream_numbers() {
9+
double istream_numbers(std::locale* loc) {
910
const char* a[] = {"-6 69 -71 2.4882e-02 -100 101 -2.00005 5000000 -50000000",
1011
"-25 71 7 -9.3262e+01 -100 101 -2.00005 5000000 -50000000",
1112
"-14 53 46 -6.7026e-02 -100 101 -2.00005 5000000 -50000000"};
@@ -14,17 +15,73 @@ double istream_numbers() {
1415
double f1 = 0.0, f2 = 0.0, q = 0.0;
1516
for (int i = 0; i < 3; i++) {
1617
std::istringstream s(a[i]);
18+
if (loc)
19+
s.imbue(*loc);
1720
s >> a1 >> a2 >> a3 >> f1 >> a4 >> a5 >> f2 >> a6 >> a7;
1821
q += (a1 + a2 + a3 + a4 + a5 + a6 + a7 + f1 + f2) / 1000000;
1922
}
2023
return q;
2124
}
2225

26+
struct LocaleSelector {
27+
std::locale* imbue;
28+
std::locale old;
29+
static std::mutex mutex;
30+
31+
LocaleSelector(benchmark::State& state) {
32+
std::lock_guard guard(mutex);
33+
switch (state.range(0)) {
34+
case 0: {
35+
old = std::locale::global(std::locale::classic());
36+
imbue = nullptr;
37+
break;
38+
}
39+
case 1: {
40+
old = std::locale::global(std::locale::classic());
41+
thread_local std::locale loc("en_US.UTF-8");
42+
imbue = &loc;
43+
break;
44+
}
45+
case 2: {
46+
old = std::locale::global(std::locale::classic());
47+
static std::locale loc("en_US.UTF-8");
48+
imbue = &loc;
49+
break;
50+
}
51+
case 3: {
52+
old = std::locale::global(std::locale("en_US.UTF-8"));
53+
imbue = nullptr;
54+
break;
55+
}
56+
}
57+
}
58+
59+
~LocaleSelector() {
60+
std::lock_guard guard(mutex);
61+
std::locale::global(old);
62+
}
63+
};
64+
65+
std::mutex LocaleSelector::mutex;
66+
2367
// Benchmarks istream_numbers() under the locale configuration selected by state.range(0).
// The LocaleSelector is set up once, outside the timed loop.
static void BM_Istream_numbers(benchmark::State& state) {
  LocaleSelector sel(state);
  double i = 0;
  // The range-based loop is the form recommended by Google Benchmark; it has less
  // per-iteration overhead than polling state.KeepRunning().
  for (auto _ : state)
    benchmark::DoNotOptimize(i += istream_numbers(sel.imbue));
}
73+
BENCHMARK(BM_Istream_numbers)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
74+
75+
// Benchmarks writing a single integer to a freshly constructed std::ostringstream under the
// locale configuration selected by state.range(0). Stream construction and the optional
// imbue() are deliberately inside the timed loop.
static void BM_Ostream_number(benchmark::State& state) {
  LocaleSelector sel(state);
  // The range-based loop is the form recommended by Google Benchmark; it has less
  // per-iteration overhead than polling state.KeepRunning().
  for (auto _ : state) {
    std::ostringstream ss;
    if (sel.imbue)
      ss.imbue(*sel.imbue);
    ss << 0;
    benchmark::DoNotOptimize(ss.str().c_str());
  }
}
85+
BENCHMARK(BM_Ostream_number)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
2886

29-
BENCHMARK(BM_Istream_numbers)->RangeMultiplier(2)->Range(1024, 4096);
3087
BENCHMARK_MAIN();

libcxx/include/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -850,6 +850,7 @@ set(files
850850
__utility/integer_sequence.h
851851
__utility/is_pointer_in_range.h
852852
__utility/move.h
853+
__utility/no_destroy.h
853854
__utility/pair.h
854855
__utility/piecewise_construct.h
855856
__utility/priority_tag.h

libcxx/include/__locale

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <__memory/shared_ptr.h> // __shared_count
1616
#include <__mutex/once_flag.h>
1717
#include <__type_traits/make_unsigned.h>
18+
#include <__utility/no_destroy.h>
1819
#include <cctype>
1920
#include <clocale>
2021
#include <cstdint>

libcxx/include/__utility/no_destroy.h

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef _LIBCPP___UTILITY_NO_DESTROY_H
10+
#define _LIBCPP___UTILITY_NO_DESTROY_H
11+
12+
#include <__config>
#include <__memory/addressof.h>
#include <__type_traits/is_constant_evaluated.h>
#include <__utility/forward.h>
#include <new>
15+
16+
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
17+
# pragma GCC system_header
18+
#endif
19+
20+
_LIBCPP_BEGIN_NAMESPACE_STD
21+
22+
// Tag type selecting the __no_destroy constructor that leaves the stored object unconstructed.
struct __uninitialized_tag {};
23+
24+
// This class stores an object of type _Tp but never destroys it.
25+
//
26+
// This is akin to using __attribute__((no_destroy)), except that it is possible
27+
// to control the lifetime of the object with more flexibility by deciding e.g.
28+
// whether to initialize the object at construction or to defer to a later
29+
// initialization using __emplace.
30+
template <class _Tp>
31+
struct __no_destroy {
32+
_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(__uninitialized_tag) : __dummy_() {
33+
if (__libcpp_is_constant_evaluated()) {
34+
__dummy_ = char();
35+
}
36+
}
37+
_LIBCPP_HIDE_FROM_ABI ~__no_destroy() {
38+
// nothing
39+
}
40+
41+
template <class... _Args>
42+
_LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI explicit __no_destroy(_Args&&... __args)
43+
: __obj_(std::forward<_Args>(__args)...) {}
44+
45+
template <class... _Args>
46+
_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp& __emplace(_Args&&... __args) {
47+
new (&__obj_) _Tp(std::forward<_Args>(__args)...);
48+
return __obj_;
49+
}
50+
51+
_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp& __get() { return __obj_; }
52+
_LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI _Tp const& __get() const { return __obj_; }
53+
54+
private:
55+
union {
56+
_Tp __obj_;
57+
char __dummy_; // so we can initialize a member even with __uninitialized_tag for constexpr-friendliness
58+
};
59+
};
60+
61+
_LIBCPP_END_NAMESPACE_STD
62+
63+
#endif // _LIBCPP___UTILITY_NO_DESTROY_H

libcxx/include/module.modulemap.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2043,6 +2043,7 @@ module std_private_utility_move [system] {
20432043
export std_private_type_traits_is_nothrow_move_constructible
20442044
export std_private_type_traits_remove_reference
20452045
}
2046+
module std_private_utility_no_destroy [system] { header "__utility/no_destroy.h" }
20462047
module std_private_utility_pair [system] {
20472048
header "__utility/pair.h"
20482049
export std_private_ranges_subrange_fwd

libcxx/src/locale.cpp

Lines changed: 46 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include <__utility/no_destroy.h>
910
#include <algorithm>
1011
#include <clocale>
1112
#include <codecvt>
@@ -81,9 +82,8 @@ locale_t __cloc() {
8182

8283
namespace {
8384

84-
struct release
85-
{
86-
void operator()(locale::facet* p) {p->__release_shared();}
85+
// Deleter used with unique_ptr<facet, releaser>: instead of deleting the facet it calls
// __release_shared() on it.
struct releaser {
86+
void operator()(locale::facet* p) { p->__release_shared(); }
8787
};
8888

8989
template <class T, class ...Args>
@@ -155,7 +155,11 @@ class _LIBCPP_HIDDEN locale::__imp
155155
{return static_cast<size_t>(id) < facets_.size() && facets_[static_cast<size_t>(id)];}
156156
const locale::facet* use_facet(long id) const;
157157

158-
private:
158+
void acquire();
159+
void release();
160+
static __no_destroy<__imp> classic_locale_imp_;
161+
162+
private:
159163
void install(facet* f, long id);
160164
template <class F> void install(F* f) {install(f, f->id.__get());}
161165
template <class F> void install_from(const __imp& other);
@@ -500,7 +504,7 @@ locale::__imp::__imp(const __imp& other, facet* f, long id)
500504
name_("*")
501505
{
502506
f->__add_shared();
503-
unique_ptr<facet, release> hold(f);
507+
unique_ptr<facet, releaser> hold(f);
504508
facets_ = other.facets_;
505509
for (unsigned i = 0; i < other.facets_.size(); ++i)
506510
if (facets_[i])
@@ -519,7 +523,7 @@ void
519523
locale::__imp::install(facet* f, long id)
520524
{
521525
f->__add_shared();
522-
unique_ptr<facet, release> hold(f);
526+
unique_ptr<facet, releaser> hold(f);
523527
if (static_cast<size_t>(id) >= facets_.size())
524528
facets_.resize(static_cast<size_t>(id+1));
525529
if (facets_[static_cast<size_t>(id)])
@@ -537,89 +541,79 @@ locale::__imp::use_facet(long id) const
537541

538542
// locale
539543

540-
// This class basically implements __attribute__((no_destroy)), which isn't supported
541-
// by GCC as of writing this.
542-
template <class T>
543-
struct __no_destroy {
544-
template <class... Args>
545-
explicit __no_destroy(Args&&... args) {
546-
T* obj = reinterpret_cast<T*>(&buf);
547-
new (obj) T(std::forward<Args>(args)...);
548-
}
549-
550-
T& get() { return *reinterpret_cast<T*>(&buf); }
551-
T const& get() const { return *reinterpret_cast<T const*>(&buf); }
552-
553-
private:
554-
alignas(T) byte buf[sizeof(T)];
555-
};
544+
// We don't do reference counting on the classic locale.
545+
// It's never destroyed anyway, but atomic reference counting may be very
546+
// expensive in parallel applications. The classic locale is used by default
547+
// in all streams. Note: if a new global locale is installed, then we lose
548+
// the benefit of no reference counting.
549+
constinit __no_destroy<locale::__imp>
550+
locale::__imp::classic_locale_imp_(__uninitialized_tag{}); // initialized below in classic()
556551

557552
// Returns the classic locale. In the added (+) lines, the function-local static's
// initializer constructs the __imp in place inside locale::__imp::classic_locale_imp_
// (via __emplace(1u)) and builds the locale object from a pointer to it.
const locale& locale::classic() {
558-
static const __no_destroy<locale> c(__private_tag{}, &make<__imp>(1u));
559-
return c.get();
553+
static const __no_destroy<locale> classic_locale(__private_tag{}, [] {
554+
// executed exactly once on first initialization of `classic_locale`
555+
locale::__imp::classic_locale_imp_.__emplace(1u);
556+
return &locale::__imp::classic_locale_imp_.__get();
557+
}());
558+
return classic_locale.__get();
560559
}
561560

562561
// Returns a mutable reference to a function-local static locale that is copy-constructed
// from locale::classic() and held in a __no_destroy wrapper.
locale& locale::__global() {
563-
static __no_destroy<locale> g(locale::classic());
564-
return g.get();
562+
static __no_destroy<locale> g(locale::classic());
563+
return g.__get();
565564
}
566565

567-
locale::locale() noexcept
568-
: __locale_(__global().__locale_)
569-
{
570-
__locale_->__add_shared();
566+
// Adds a shared reference to this __imp, unless this is the classic locale's __imp
// (identified by address), in which case nothing is done.
void locale::__imp::acquire() {
567+
if (this != &locale::__imp::classic_locale_imp_.__get())
568+
__add_shared();
571569
}
572570

573-
locale::locale(const locale& l) noexcept
574-
: __locale_(l.__locale_)
575-
{
576-
__locale_->__add_shared();
571+
// Releases a shared reference on this __imp, unless this is the classic locale's __imp
// (identified by address), in which case nothing is done.
void locale::__imp::release() {
572+
if (this != &locale::__imp::classic_locale_imp_.__get())
573+
__release_shared();
577574
}
578575

579-
locale::~locale()
580-
{
581-
__locale_->__release_shared();
582-
}
576+
// Default constructor: shares the __imp of the current global locale and acquires it.
locale::locale() noexcept : __locale_(__global().__locale_) { __locale_->acquire(); }
577+
578+
// Copy constructor: shares the source's __imp and acquires it.
locale::locale(const locale& l) noexcept : __locale_(l.__locale_) { __locale_->acquire(); }
579+
580+
// Destructor: releases this object's __imp.
locale::~locale() { __locale_->release(); }
583581

584582
// Copy assignment: switches this locale to share other's __imp.
// The source __imp is acquired before the current one is released; with that ordering a
// self-assignment does not release the only reference before re-acquiring it.
const locale&
585583
locale::operator=(const locale& other) noexcept
586584
{
587-
other.__locale_->__add_shared();
588-
__locale_->__release_shared();
589-
__locale_ = other.__locale_;
590-
return *this;
585+
other.__locale_->acquire();
586+
__locale_->release();
587+
__locale_ = other.__locale_;
588+
return *this;
591589
}
592590

593591
locale::locale(const char* name)
594592
: __locale_(name ? new __imp(name)
595593
: (__throw_runtime_error("locale constructed with null"), nullptr))
596594
{
597-
__locale_->__add_shared();
595+
__locale_->acquire();
598596
}
599597

600-
locale::locale(const string& name)
601-
: __locale_(new __imp(name))
602-
{
603-
__locale_->__add_shared();
604-
}
598+
locale::locale(const string& name) : __locale_(new __imp(name)) { __locale_->acquire(); }
605599

606600
locale::locale(const locale& other, const char* name, category c)
607601
: __locale_(name ? new __imp(*other.__locale_, name, c)
608602
: (__throw_runtime_error("locale constructed with null"), nullptr))
609603
{
610-
__locale_->__add_shared();
604+
__locale_->acquire();
611605
}
612606

613607
locale::locale(const locale& other, const string& name, category c)
614608
: __locale_(new __imp(*other.__locale_, name, c))
615609
{
616-
__locale_->__add_shared();
610+
__locale_->acquire();
617611
}
618612

619613
locale::locale(const locale& other, const locale& one, category c)
620614
: __locale_(new __imp(*other.__locale_, *one.__locale_, c))
621615
{
622-
__locale_->__add_shared();
616+
__locale_->acquire();
623617
}
624618

625619
string
@@ -635,7 +629,7 @@ locale::__install_ctor(const locale& other, facet* f, long id)
635629
__locale_ = new __imp(*other.__locale_, f, id);
636630
else
637631
__locale_ = other.__locale_;
638-
__locale_->__add_shared();
632+
__locale_->acquire();
639633
}
640634

641635
locale

0 commit comments

Comments
 (0)