8261027: AArch64: Support for LSE atomics C++ HotSpot code #2434

Closed
wants to merge 16 commits
@@ -0,0 +1,46 @@
/* Copyright (c) 2021, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/

#ifndef CPU_AARCH64_ATOMIC_AARCH64_HPP
#define CPU_AARCH64_ATOMIC_AARCH64_HPP

// Atomic stub implementation.
// Default implementations are in atomic_linux_aarch64.S
//
// All stubs pass arguments the same way
// x0: src/dest address
// x1: arg1
// x2: arg2 (optional)
// x3, x8, x9: scratch
typedef uint64_t (*aarch64_atomic_stub_t)(volatile void *ptr, uint64_t arg1, uint64_t arg2);

// Pointers to stubs
extern aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl;
extern aarch64_atomic_stub_t aarch64_atomic_fetch_add_8_impl;
extern aarch64_atomic_stub_t aarch64_atomic_xchg_4_impl;
extern aarch64_atomic_stub_t aarch64_atomic_xchg_8_impl;
extern aarch64_atomic_stub_t aarch64_atomic_cmpxchg_1_impl;
extern aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_impl;
extern aarch64_atomic_stub_t aarch64_atomic_cmpxchg_8_impl;

#endif // CPU_AARCH64_ATOMIC_AARCH64_HPP
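
For orientation, here is a minimal sketch of how one of these stub pointers can be called from C++; the wrapper name is hypothetical and not part of the patch (the real callers are the platform Atomic implementations). All stubs share the aarch64_atomic_stub_t signature, so unused arguments are simply passed as zero.

// Hedged sketch, assuming <cstdint> and "atomic_aarch64.hpp" are included:
// call the 8-byte fetch-and-add stub through its pointer. The stub returns
// the value that was at *ptr before the addition; arg2 is unused here.
static inline uint64_t example_fetch_add_8(volatile uint64_t* ptr, uint64_t incr) {
  return aarch64_atomic_fetch_add_8_impl(ptr, incr, 0 /* arg2 unused */);
}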
@@ -2566,6 +2566,8 @@ void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {

ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
ATOMIC_XCHG(xchgl, swpl, ldxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchglw, swpl, ldxrw, stlxrw, Assembler::word)
ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)

@@ -1039,6 +1039,8 @@ class MacroAssembler: public Assembler {

void atomic_xchg(Register prev, Register newv, Register addr);
void atomic_xchgw(Register prev, Register newv, Register addr);
void atomic_xchgl(Register prev, Register newv, Register addr);
void atomic_xchglw(Register prev, Register newv, Register addr);
void atomic_xchgal(Register prev, Register newv, Register addr);
void atomic_xchgalw(Register prev, Register newv, Register addr);

@@ -26,6 +26,7 @@
#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "atomic_aarch64.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
@@ -38,6 +39,7 @@
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/atomic.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
@@ -1361,7 +1363,7 @@ class StubGenerator: public StubCodeGenerator {
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomicly.
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
// disjoint_int_copy_entry is set to the no-overlap entry point
@@ -1431,7 +1433,7 @@ class StubGenerator: public StubCodeGenerator {
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomicly.
// cache line boundaries will still be loaded and stored atomically.
//
address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
address *entry, const char *name,
@@ -1596,7 +1598,7 @@ class StubGenerator: public StubCodeGenerator {
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomicly.
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
// disjoint_int_copy_entry is set to the no-overlap entry point
@@ -1620,7 +1622,7 @@ class StubGenerator: public StubCodeGenerator {
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it. The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomicly.
// cache line boundaries will still be loaded and stored atomically.
//
address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
address *entry, const char *name,
@@ -5571,6 +5573,91 @@ class StubGenerator: public StubCodeGenerator {
return start;
}

#ifdef LINUX
// ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX.
//
// If LSE is in use, generate LSE versions of all the stubs. The
// non-LSE versions are in atomic_aarch64.S.
This conversation was marked as resolved by theRealAph

@nick-arm (Member) commented on Feb 8, 2021:

IMO it would be better for maintainability if the LSE versions were in atomic_aarch64.S too (with an explicit .arch armv8-a+lse directive). Is there any reason to generate them here, other than to support old toolchains? As far as I can tell GNU as supported LSE as far back as binutils 2.27.

https://sourceware.org/binutils/docs-2.27/as/AArch64-Extensions.html

@theRealAph (Author, Contributor) commented on Feb 8, 2021:

I can't see any reason to do this. There'd be no benefit to moving this stuff, and it would be harder to change in the future. I'd do the whole lot as runtime stubs if I could, but they're needed before VM startup.

@theRealAph (Author, Contributor) commented on Feb 9, 2021:

And I should also have said: I intend to do highly-optimized versions of the LSE atomics in a subsequent PR, and I'd much prefer to do the work internally within HotSpot.

void generate_atomic_entry_points() {

if (! UseLSE) {
return;
}

__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "atomic entry points");

__ align(32);
aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r2, addr = c_rarg0, incr = c_rarg1;
__ atomic_addal(prev, incr, addr);
__ mov(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r2, addr = c_rarg0, incr = c_rarg1;
__ atomic_addalw(prev, incr, addr);
__ movw(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r2, addr = c_rarg0, newv = c_rarg1;
__ atomic_xchglw(prev, newv, addr);
__ movw(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r2, addr = c_rarg0, newv = c_rarg1;
__ atomic_xchgl(prev, newv, addr);
__ mov(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
exchange_val = c_rarg2;
__ cmpxchg(ptr, compare_val, exchange_val,
MacroAssembler::byte,
/*acquire*/false, /*release*/false, /*weak*/false,
prev);
__ movw(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
exchange_val = c_rarg2;
__ cmpxchg(ptr, compare_val, exchange_val,
MacroAssembler::word,
/*acquire*/false, /*release*/false, /*weak*/false,
prev);
__ movw(r0, prev);
__ ret(lr);
}
__ align(32);
aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)__ pc();
{
Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
exchange_val = c_rarg2;
__ cmpxchg(ptr, compare_val, exchange_val,
MacroAssembler::xword,
/*acquire*/false, /*release*/false, /*weak*/false,
prev);
__ mov(r0, prev);
__ ret(lr);
}
}
#endif // LINUX
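
A note on the ordering of the generated entry points: the fetch_add stubs go through atomic_addal/atomic_addalw (acquire+release), the xchg stubs through atomic_xchgl/atomic_xchglw (release, via SWPL), and the cmpxchg stubs are generated with acquire=false/release=false (relaxed). As a hedged sketch, not part of the patch, the fetch-add behaviour can be approximated with the GCC/Clang atomic builtins; the function name below is illustrative only.

// Hedged sketch: approximate C++-level behaviour of the generated
// aarch64_atomic_fetch_add_8 stub. Returns the value previously at *ptr.
static uint64_t fetch_add_8_semantics(volatile uint64_t* ptr, uint64_t incr) {
  return __atomic_fetch_add(ptr, incr, __ATOMIC_ACQ_REL);
}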

// Continuation point for throwing of implicit exceptions that are
// not handled in the current activation. Fabricates an exception
// oop and initiates normal exception dispatching in this
@@ -6683,6 +6770,12 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
}

#ifdef LINUX

generate_atomic_entry_points();

#endif // LINUX

StubRoutines::aarch64::set_completed();
}

@@ -6703,3 +6796,27 @@ void StubGenerator_generate(CodeBuffer* code, bool all) {
}
StubGenerator g(code, all);
}


#ifdef LINUX

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE) \
extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## _default_impl \
(volatile void *ptr, uint64_t arg1, uint64_t arg2); \
aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## _impl \
= aarch64_atomic_ ## OPNAME ## _ ## SIZE ## _default_impl;

DEFAULT_ATOMIC_OP(fetch_add, 4)
DEFAULT_ATOMIC_OP(fetch_add, 8)
DEFAULT_ATOMIC_OP(xchg, 4)
DEFAULT_ATOMIC_OP(xchg, 8)
DEFAULT_ATOMIC_OP(cmpxchg, 1)
DEFAULT_ATOMIC_OP(cmpxchg, 4)
DEFAULT_ATOMIC_OP(cmpxchg, 8)

#undef DEFAULT_ATOMIC_OP
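
For reference, this is what the macro expands to for one of the instantiations above; the expansion is shown only for illustration and adds nothing beyond the macro itself.

// Illustrative expansion of DEFAULT_ATOMIC_OP(fetch_add, 4):
// declare the hand-written default from atomic_aarch64.S ...
extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
  (volatile void *ptr, uint64_t arg1, uint64_t arg2);
// ... and initialize the dispatch pointer to it; when UseLSE is set,
// generate_atomic_entry_points() later repoints it at a generated LSE stub.
aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
  = aarch64_atomic_fetch_add_4_default_impl;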

#endif // LINUX
@@ -0,0 +1,96 @@
// Copyright (c) 2021, Red Hat Inc. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.

// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.

// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).

// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.

// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.



.text

.globl aarch64_atomic_fetch_add_8_default_impl
.align 5
aarch64_atomic_fetch_add_8_default_impl:
0: ldaxr x2, [x0]
add x8, x2, x1
stlxr w9, x8, [x0]
cbnz w9, 0b
mov x0, x2
ret

.globl aarch64_atomic_fetch_add_4_default_impl
.align 5
aarch64_atomic_fetch_add_4_default_impl:
0: ldaxr w2, [x0]
add w8, w2, w1
stlxr w9, w8, [x0]
cbnz w9, 0b
mov w0, w2
ret

.globl aarch64_atomic_xchg_4_default_impl
.align 5
aarch64_atomic_xchg_4_default_impl:
0: ldaxr w2, [x0]
stlxr w8, w1, [x0]
cbnz w8, 0b
mov w0, w2
ret

.globl aarch64_atomic_xchg_8_default_impl
.align 5
aarch64_atomic_xchg_8_default_impl:
0: ldaxr x2, [x0]
stlxr w8, x1, [x0]
cbnz w8, 0b
mov x0, x2
ret

.globl aarch64_atomic_cmpxchg_1_default_impl
.align 5
aarch64_atomic_cmpxchg_1_default_impl:
0: ldxrb w3, [x0]
eor w8, w3, w1
tst x8, #0xff
b.ne 1f
stxrb w8, w2, [x0]
cbnz w8, 0b
1: mov w0, w3
ret

.globl aarch64_atomic_cmpxchg_4_default_impl
.align 5
aarch64_atomic_cmpxchg_4_default_impl:
0: ldxr w3, [x0]
cmp w3, w1
b.ne 1f
stxr w8, w2, [x0]
cbnz w8, 0b
1: mov w0, w3
ret

.globl aarch64_atomic_cmpxchg_8_default_impl
.align 5
aarch64_atomic_cmpxchg_8_default_impl:
0: ldxr x3, [x0]
cmp x3, x1
b.ne 1f
stxr w8, x2, [x0]
cbnz w8, 0b
1: mov x0, x3
ret
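
Note that the default fetch_add and xchg loops above use ldaxr/stlxr pairs (acquire/release), while the cmpxchg loops use plain ldxr/stxr (relaxed). As a hedged reference, not part of the patch, the 4-byte compare-and-swap behaves roughly like the following C++, which returns the value observed at ptr whether or not the store was performed (matching "mov w0, w3" above); the function name is illustrative only.

// Hedged sketch, assuming <cstdint>: approximate C++-level behaviour of
// aarch64_atomic_cmpxchg_4_default_impl (relaxed ordering, strong CAS).
static uint64_t cmpxchg_4_semantics(volatile void *ptr,
                                    uint64_t compare_value,
                                    uint64_t exchange_value) {
  uint32_t expected = (uint32_t)compare_value;
  __atomic_compare_exchange_n((volatile uint32_t*)ptr, &expected,
                              (uint32_t)exchange_value,
                              /*weak*/ false,
                              __ATOMIC_RELAXED, __ATOMIC_RELAXED);
  return expected;  // old value on success, conflicting value on failure
}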