-
Notifications
You must be signed in to change notification settings - Fork 10.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[libc++] Speed up classic locale #72112
Conversation
@llvm/pr-subscribers-libcxx Author: Dmitry Vyukov (dvyukov) Changes: Locale objects use atomic reference counting, which may be very expensive in parallel applications. The classic locale is used by default by all streams and can be very contended. But it's never destroyed, so the reference counting is also completely pointless on the classic locale. Currently ~70% of time in the parallel stringstream benchmarks is spent in locale ctor/dtor. And the execution radically slows down with more threads. Avoid reference counting on the classic locale. With this change parallel benchmarks start to scale with threads.
Full diff: https://github.com/llvm/llvm-project/pull/72112.diff 2 Files Affected:
diff --git a/libcxx/benchmarks/stringstream.bench.cpp b/libcxx/benchmarks/stringstream.bench.cpp
index ea602557ccd770e..c10ee3a8cc5b83c 100644
--- a/libcxx/benchmarks/stringstream.bench.cpp
+++ b/libcxx/benchmarks/stringstream.bench.cpp
@@ -1,11 +1,12 @@
#include "benchmark/benchmark.h"
#include "test_macros.h"
+#include <mutex>
#include <sstream>
TEST_NOINLINE double istream_numbers();
-double istream_numbers() {
+double istream_numbers(std::locale* l) {
const char* a[] = {"-6 69 -71 2.4882e-02 -100 101 -2.00005 5000000 -50000000",
"-25 71 7 -9.3262e+01 -100 101 -2.00005 5000000 -50000000",
"-14 53 46 -6.7026e-02 -100 101 -2.00005 5000000 -50000000"};
@@ -14,17 +15,72 @@ double istream_numbers() {
double f1 = 0.0, f2 = 0.0, q = 0.0;
for (int i = 0; i < 3; i++) {
std::istringstream s(a[i]);
+ if (l)
+ s.imbue(*l);
s >> a1 >> a2 >> a3 >> f1 >> a4 >> a5 >> f2 >> a6 >> a7;
q += (a1 + a2 + a3 + a4 + a5 + a6 + a7 + f1 + f2) / 1000000;
}
return q;
}
+struct LocaleSelector {
+ std::locale* imbue;
+ std::locale old;
+
+ LocaleSelector(benchmark::State& state) {
+ static std::mutex mu;
+ std::lock_guard l(mu);
+ switch (state.range(0)) {
+ case 0: {
+ old = std::locale::global(std::locale::classic());
+ imbue = nullptr;
+ break;
+ }
+ case 1: {
+ old = std::locale::global(std::locale::classic());
+ thread_local std::locale l("en_US.UTF-8");
+ imbue = &l;
+ break;
+ }
+ case 2: {
+ old = std::locale::global(std::locale::classic());
+ static std::locale l("en_US.UTF-8");
+ imbue = &l;
+ break;
+ }
+ case 3: {
+ old = std::locale::global(std::locale("en_US.UTF-8"));
+ imbue = nullptr;
+ break;
+ }
+ }
+ }
+
+ ~LocaleSelector() {
+ static std::mutex mu;
+ std::lock_guard l(mu);
+ std::locale::global(old);
+ }
+};
+
static void BM_Istream_numbers(benchmark::State& state) {
+ LocaleSelector sel(state);
double i = 0;
while (state.KeepRunning())
- benchmark::DoNotOptimize(i += istream_numbers());
+ benchmark::DoNotOptimize(i += istream_numbers(sel.imbue));
+}
+BENCHMARK(BM_Istream_numbers)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
+
+static void BM_Ostream_number(benchmark::State& state) {
+ LocaleSelector sel(state);
+ while (state.KeepRunning()) {
+ std::ostringstream ss;
+ if (sel.imbue)
+ ss.imbue(*sel.imbue);
+ ss << 0;
+ benchmark::DoNotOptimize(ss.str().c_str());
+ }
}
+BENCHMARK(BM_Ostream_number)->DenseRange(0, 3)->UseRealTime()->Threads(1)->ThreadPerCpu();
-BENCHMARK(BM_Istream_numbers)->RangeMultiplier(2)->Range(1024, 4096);
BENCHMARK_MAIN();
diff --git a/libcxx/src/locale.cpp b/libcxx/src/locale.cpp
index c37e091dcc4671b..58b2d6c33606ba9 100644
--- a/libcxx/src/locale.cpp
+++ b/libcxx/src/locale.cpp
@@ -8,6 +8,7 @@
#include <__utility/unreachable.h>
#include <algorithm>
+#include <atomic>
#include <clocale>
#include <codecvt>
#include <cstddef>
@@ -80,7 +81,7 @@ locale_t __cloc() {
namespace {
-struct release
+struct releaser
{
void operator()(locale::facet* p) {p->__release_shared();}
};
@@ -154,12 +155,16 @@ class _LIBCPP_HIDDEN locale::__imp
{return static_cast<size_t>(id) < facets_.size() && facets_[static_cast<size_t>(id)];}
const locale::facet* use_facet(long id) const;
+ void acquire();
+ void release();
+
static const locale& make_classic();
static locale& make_global();
private:
void install(facet* f, long id);
template <class F> void install(F* f) {install(f, f->id.__get());}
template <class F> void install_from(const __imp& other);
+ static std::atomic<__imp*> classic_;
};
locale::__imp::__imp(size_t refs)
@@ -501,7 +506,7 @@ locale::__imp::__imp(const __imp& other, facet* f, long id)
name_("*")
{
f->__add_shared();
- unique_ptr<facet, release> hold(f);
+ unique_ptr<facet, releaser> hold(f);
facets_ = other.facets_;
for (unsigned i = 0; i < other.facets_.size(); ++i)
if (facets_[i])
@@ -520,7 +525,7 @@ void
locale::__imp::install(facet* f, long id)
{
f->__add_shared();
- unique_ptr<facet, release> hold(f);
+ unique_ptr<facet, releaser> hold(f);
if (static_cast<size_t>(id) >= facets_.size())
facets_.resize(static_cast<size_t>(id+1));
if (facets_[static_cast<size_t>(id)])
@@ -538,6 +543,8 @@ locale::__imp::use_facet(long id) const
// locale
+std::atomic<locale::__imp*> locale::__imp::classic_;
+
const locale&
locale::__imp::make_classic()
{
@@ -545,9 +552,22 @@ locale::__imp::make_classic()
alignas(locale) static std::byte buf[sizeof(locale)];
locale* c = reinterpret_cast<locale*>(&buf);
c->__locale_ = &make<__imp>(1u);
+ classic_.store(c->__locale_, std::memory_order_relaxed);
return *c;
}
+void locale::__imp::acquire()
+{
+ if (this != classic_.load(std::memory_order_relaxed))
+ __add_shared();
+}
+
+void locale::__imp::release()
+{
+ if (this != classic_.load(std::memory_order_relaxed))
+ __release_shared();
+}
+
const locale&
locale::classic()
{
@@ -574,25 +594,25 @@ locale::__global()
locale::locale() noexcept
: __locale_(__global().__locale_)
{
- __locale_->__add_shared();
+ __locale_->acquire();
}
locale::locale(const locale& l) noexcept
: __locale_(l.__locale_)
{
- __locale_->__add_shared();
+ __locale_->acquire();
}
locale::~locale()
{
- __locale_->__release_shared();
+ __locale_->release();
}
const locale&
locale::operator=(const locale& other) noexcept
{
- other.__locale_->__add_shared();
- __locale_->__release_shared();
+ other.__locale_->acquire();
+ __locale_->release();
__locale_ = other.__locale_;
return *this;
}
@@ -601,32 +621,32 @@ locale::locale(const char* name)
: __locale_(name ? new __imp(name)
: (__throw_runtime_error("locale constructed with null"), nullptr))
{
- __locale_->__add_shared();
+ __locale_->acquire();
}
locale::locale(const string& name)
: __locale_(new __imp(name))
{
- __locale_->__add_shared();
+ __locale_->acquire();
}
locale::locale(const locale& other, const char* name, category c)
: __locale_(name ? new __imp(*other.__locale_, name, c)
: (__throw_runtime_error("locale constructed with null"), nullptr))
{
- __locale_->__add_shared();
+ __locale_->acquire();
}
locale::locale(const locale& other, const string& name, category c)
: __locale_(new __imp(*other.__locale_, name, c))
{
- __locale_->__add_shared();
+ __locale_->acquire();
}
locale::locale(const locale& other, const locale& one, category c)
: __locale_(new __imp(*other.__locale_, *one.__locale_, c))
{
- __locale_->__add_shared();
+ __locale_->acquire();
}
string
@@ -642,7 +662,7 @@ locale::__install_ctor(const locale& other, facet* f, long id)
__locale_ = new __imp(*other.__locale_, f, id);
else
__locale_ = other.__locale_;
- __locale_->__add_shared();
+ __locale_->acquire();
}
locale
|
This is an alternative version of #70631 (without inlining of ctor/dtor). |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the patch! I had a short look and see some things I want to look at later this week in more detail.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please take another look.
Locale objects use atomic reference counting, which may be very expensive in parallel applications. The classic locale is used by default by all streams and can be very contended. But it's never destroyed, so the reference counting is also completely pointless on the classic locale. Currently ~70% of time in the parallel stringstream benchmarks is spent in locale ctor/dtor. And the execution radically slows down with more threads. Avoid reference counting on the classic locale. With this change parallel benchmarks start to scale with threads.

                              │    baseline     │               optimized                │
                              │     sec/op      │    sec/op      vs base                 │
Istream_numbers/0/threads:1       4.672µ ± 0%      4.419µ ± 0%    -5.42% (p=0.000 n=30+39)
Istream_numbers/0/threads:72    539.817µ ± 0%      9.842µ ± 1%   -98.18% (p=0.000 n=30+40)
Istream_numbers/1/threads:1       4.890µ ± 0%      4.750µ ± 0%    -2.85% (p=0.000 n=30+40)
Istream_numbers/1/threads:72      66.44µ ± 1%      10.14µ ± 1%   -84.74% (p=0.000 n=30+40)
Istream_numbers/2/threads:1       4.888µ ± 0%      4.746µ ± 0%    -2.92% (p=0.000 n=30+40)
Istream_numbers/2/threads:72      494.8µ ± 0%      410.2µ ± 1%   -17.11% (p=0.000 n=30+40)
Istream_numbers/3/threads:1       4.697µ ± 0%      4.695µ ± 5%         ~ (p=0.391 n=30+37)
Istream_numbers/3/threads:72      421.5µ ± 7%      421.9µ ± 9%         ~ (p=0.665 n=30)
Ostream_number/0/threads:1        183.0n ± 0%      141.0n ± 2%   -22.95% (p=0.000 n=30)
Ostream_number/0/threads:72     24196.5n ± 1%      343.5n ± 3%   -98.58% (p=0.000 n=30)
Ostream_number/1/threads:1        250.0n ± 0%      196.0n ± 2%   -21.60% (p=0.000 n=30)
Ostream_number/1/threads:72     16260.5n ± 0%      407.0n ± 2%   -97.50% (p=0.000 n=30)
Ostream_number/2/threads:1        254.0n ± 0%      196.0n ± 1%   -22.83% (p=0.000 n=30)
Ostream_number/2/threads:72       28.49µ ± 1%      18.89µ ± 5%   -33.72% (p=0.000 n=30)
Ostream_number/3/threads:1        185.0n ± 0%      185.0n ± 0%     0.00% (p=0.017 n=30)
Ostream_number/3/threads:72       19.38µ ± 4%      19.33µ ± 5%         ~ (p=0.425 n=30)
c38f931
to
3a4039d
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
I agree this is better. The change looks good to me. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This LGTM in the current state.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great to see we can do it without the atomics!
In general happy, but there is one issue and two nits.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM! Please address @ldionne's comments before merging.
Should I squash and merge this? Or what's the next step? There are some test failures, but they don't look related:
|
I've re-run the failing tests. I am going to merge it now. Let's see if it sticks.
|
This caused the following error on bots. @ldionne perhaps classic_locale_imp_ should be constinit? https://lab.llvm.org/buildbot/#/builders/168/builds/17053
|
Thanks for the heads up. See #73533 |
Looks like it broke the ASAN build: https://lab.llvm.org/buildbot/#/builders/168/builds/17053/steps/9/logs/stdio This reverts commit f8afc53.
@dvyukov FYI... |
Locale objects use atomic reference counting, which may be very expensive in parallel applications. The classic locale is used by default by all streams and can be very contended. But it's never destroyed, so the reference counting is also completely pointless on the classic locale. Currently ~70% of time in the parallel stringstream benchmarks is spent in locale ctor/dtor. And the execution radically slows down with more threads. Avoid reference counting on the classic locale. With this change parallel benchmarks start to scale with threads. This is a re-application of f8afc53 (aka PR llvm#72112) which was reverted in 4e0c48b because it broke the sanitizer builds due to an initialization order fiasco. This issue has now been fixed by ensuring that the locale is constinit'ed. Co-authored-by: Louis Dionne <ldionne.2@gmail.com>
Locale objects use atomic reference counting, which may be very expensive in parallel applications. The classic locale is used by default by all streams and can be very contended. But it's never destroyed, so the reference counting is also completely pointless on the classic locale. Currently ~70% of time in the parallel stringstream benchmarks is spent in locale ctor/dtor. And the execution radically slows down with more threads. Avoid reference counting on the classic locale. With this change parallel benchmarks start to scale with threads. This is a re-application of f8afc53 (aka PR #72112) which was reverted in 4e0c48b because it broke the sanitizer builds due to an initialization order fiasco. This issue has now been fixed by ensuring that the locale is constinit'ed. Co-authored-by: Dmitry Vyukov <dvyukov@google.com>
Locale objects use atomic reference counting, which may be very expensive in parallel applications. The classic locale is used by default by all streams and can be very contended. But it's never destroyed, so the reference counting is also completely pointless on the classic locale. Currently ~70% of time in the parallel stringstream benchmarks is spent in locale ctor/dtor. And the execution radically slows down with more threads.
Avoid reference counting on the classic locale. With this change parallel benchmarks start to scale with threads.