Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
8290688: Optimize x86_64 nmethod entry barriers
Reviewed-by: kvn, rrich
  • Loading branch information
fisk committed Jul 22, 2022
1 parent 54854d9 commit b28f9da
Show file tree
Hide file tree
Showing 16 changed files with 161 additions and 15 deletions.
2 changes: 2 additions & 0 deletions src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
Expand Up @@ -28,6 +28,8 @@
// C2_MacroAssembler contains high-level macros for C2

public:
// The AArch64 C2 macro assembler emits no out-of-line nmethod entry barrier stub:
// emit_entry_barrier_stub() has an empty body and the reported stub size is 0 bytes.
void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
static int entry_barrier_stub_size() { return 0; }

void string_compare(Register str1, Register str2,
Register cnt1, Register cnt2, Register result,
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/cpu/arm/c2_MacroAssembler_arm.hpp
Expand Up @@ -28,6 +28,9 @@
// C2_MacroAssembler contains high-level macros for C2

public:
// The ARM C2 macro assembler emits no out-of-line nmethod entry barrier stub:
// emit_entry_barrier_stub() has an empty body and the reported stub size is 0 bytes.
void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
static int entry_barrier_stub_size() { return 0; }

// Compare char[] arrays aligned to 4 bytes.
void char_arrays_equals(Register ary1, Register ary2,
Register limit, Register result,
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/cpu/ppc/c2_MacroAssembler_ppc.hpp
Expand Up @@ -28,6 +28,9 @@
// C2_MacroAssembler contains high-level macros for C2

public:
// The PPC C2 macro assembler emits no out-of-line nmethod entry barrier stub:
// emit_entry_barrier_stub() has an empty body and the reported stub size is 0 bytes.
void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
static int entry_barrier_stub_size() { return 0; }

// Intrinsics for CompactStrings
// Compress char[] to byte[] by compressing 16 bytes at once.
void string_compress_16(Register src, Register dst, Register cnt,
Expand Down
2 changes: 2 additions & 0 deletions src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.hpp
Expand Up @@ -36,6 +36,8 @@
VectorRegister vrs,
bool is_latin, Label& DONE);
public:
// The RISC-V C2 macro assembler emits no out-of-line nmethod entry barrier stub:
// emit_entry_barrier_stub() has an empty body and the reported stub size is 0 bytes.
void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
static int entry_barrier_stub_size() { return 0; }

void string_compare(Register str1, Register str2,
Register cnt1, Register cnt2, Register result,
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/cpu/s390/c2_MacroAssembler_s390.hpp
Expand Up @@ -29,6 +29,9 @@
// C2_MacroAssembler contains high-level macros for C2

public:
// The s390 C2 macro assembler emits no out-of-line nmethod entry barrier stub:
// emit_entry_barrier_stub() has an empty body and the reported stub size is 0 bytes.
void emit_entry_barrier_stub(C2EntryBarrierStub* stub) {}
static int entry_barrier_stub_size() { return 0; }

//-------------------------------------------
// Special String Intrinsics Implementation.
//-------------------------------------------
Expand Down
3 changes: 2 additions & 1 deletion src/hotspot/cpu/x86/c1_MacroAssembler_x86.cpp
Expand Up @@ -325,7 +325,8 @@ void C1_MacroAssembler::build_frame(int frame_size_in_bytes, int bang_size_in_by
decrement(rsp, frame_size_in_bytes); // does not emit code for frame_size == 0

BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->nmethod_entry_barrier(this);
// C1 code is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */);
}


Expand Down
31 changes: 30 additions & 1 deletion src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Expand Up @@ -30,6 +30,7 @@
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/output.hpp"
#include "opto/opcodes.hpp"
#include "opto/subnode.hpp"
#include "runtime/objectMonitor.hpp"
Expand Down Expand Up @@ -128,10 +129,38 @@ void C2_MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool

if (!is_stub) {
BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->nmethod_entry_barrier(this);
#ifdef _LP64
if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) {
// We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
Label dummy_slow_path;
Label dummy_continuation;
Label* slow_path = &dummy_slow_path;
Label* continuation = &dummy_continuation;
if (!Compile::current()->output()->in_scratch_emit_size()) {
// Use real labels from actual stub when not emitting code for the purpose of measuring its size
C2EntryBarrierStub* stub = Compile::current()->output()->entry_barrier_table()->add_entry_barrier();
slow_path = &stub->slow_path();
continuation = &stub->continuation();
}
bs->nmethod_entry_barrier(this, slow_path, continuation);
}
#else
// Don't bother with out-of-line nmethod entry barrier stub for x86_32.
bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */);
#endif
}
}

// Emits the out-of-line part of the C2 nmethod entry barrier for the given stub:
// binds the stub's slow-path label, calls the x86 method_entry_barrier stub routine,
// and then jumps to the stub's continuation label.
void C2_MacroAssembler::emit_entry_barrier_stub(C2EntryBarrierStub* stub) {
// The slow-path label is bound here, at the start of the out-of-line code.
bind(stub->slow_path());
call(RuntimeAddress(StubRoutines::x86::method_entry_barrier()));
// maybe_short is false for this jump.
// NOTE(review): presumably this keeps the jump at a fixed length so the emitted code
// always matches entry_barrier_stub_size() - confirm against MacroAssembler::jmp.
jmp(stub->continuation(), false /* maybe_short */);
}

// Size in bytes of the out-of-line nmethod entry barrier stub on x86.
// NOTE(review): presumably a 5-byte call followed by a 5-byte near jmp - confirm
// against the instruction encodings if the stub's code sequence ever changes.
int C2_MacroAssembler::entry_barrier_stub_size() {
  const int stub_size_in_bytes = 10;
  return stub_size_in_bytes;
}

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
switch (vlen_in_bytes) {
case 4: // fall-through
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Expand Up @@ -31,6 +31,9 @@
// C2 compiled method's prolog code.
void verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub);

// Emits the out-of-line code of the nmethod entry barrier for the given stub.
void emit_entry_barrier_stub(C2EntryBarrierStub* stub);
// Size in bytes of the nmethod entry barrier stub code.
static int entry_barrier_stub_size();

Assembler::AvxVectorLen vector_length_encoding(int vlen_in_bytes);

// Code used by cmpFastLock and cmpFastUnlock mach instructions in .ad file.
Expand Down
26 changes: 19 additions & 7 deletions src/hotspot/cpu/x86/gc/shared/barrierSetAssembler_x86.cpp
Expand Up @@ -309,22 +309,34 @@ void BarrierSetAssembler::incr_allocated_bytes(MacroAssembler* masm, Register th
}

#ifdef _LP64
void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm) {
void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm, Label* slow_path, Label* continuation) {
BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
if (bs_nm == NULL) {
return;
}
Label continuation;
Register thread = r15_thread;
Address disarmed_addr(thread, in_bytes(bs_nm->thread_disarmed_offset()));
__ align(8);
// The immediate is the last 4 bytes, so if we align the start of the cmp
// instruction to 4 bytes, we know that the second half of it is also 4
// byte aligned, which means that the immediate will not cross a cache line
__ align(4);
uintptr_t before_cmp = (uintptr_t)__ pc();
__ cmpl(disarmed_addr, 0);
__ jcc(Assembler::equal, continuation);
__ call(RuntimeAddress(StubRoutines::x86::method_entry_barrier()));
__ bind(continuation);
uintptr_t after_cmp = (uintptr_t)__ pc();
guarantee(after_cmp - before_cmp == 8, "Wrong assumed instruction length");

if (slow_path != NULL) {
__ jcc(Assembler::notEqual, *slow_path);
__ bind(*continuation);
} else {
Label done;
__ jccb(Assembler::equal, done);
__ call(RuntimeAddress(StubRoutines::x86::method_entry_barrier()));
__ bind(done);
}
}
#else
void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm) {
void BarrierSetAssembler::nmethod_entry_barrier(MacroAssembler* masm, Label*, Label*) {
BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
if (bs_nm == NULL) {
return;
Expand Down
2 changes: 1 addition & 1 deletion src/hotspot/cpu/x86/gc/shared/barrierSetAssembler_x86.hpp
Expand Up @@ -68,7 +68,7 @@ class BarrierSetAssembler: public CHeapObj<mtGC> {

virtual void barrier_stubs_init() {}

virtual void nmethod_entry_barrier(MacroAssembler* masm);
virtual void nmethod_entry_barrier(MacroAssembler* masm, Label* slow_path, Label* continuation);
virtual void c2i_entry_barrier(MacroAssembler* masm);
};

Expand Down
17 changes: 14 additions & 3 deletions src/hotspot/cpu/x86/gc/shared/barrierSetNMethod_x86.cpp
Expand Up @@ -32,6 +32,7 @@
#include "runtime/sharedRuntime.hpp"
#include "utilities/align.hpp"
#include "utilities/debug.hpp"
#include "utilities/macros.hpp"

class NativeNMethodCmpBarrier: public NativeInstruction {
public:
Expand Down Expand Up @@ -62,7 +63,7 @@ class NativeNMethodCmpBarrier: public NativeInstruction {

#ifdef _LP64
void NativeNMethodCmpBarrier::verify() const {
if (((uintptr_t) instruction_address()) & 0x7) {
if (((uintptr_t) instruction_address()) & 0x3) {
fatal("Not properly aligned");
}

Expand Down Expand Up @@ -156,10 +157,20 @@ void BarrierSetNMethod::deoptimize(nmethod* nm, address* return_address_ptr) {
// NativeNMethodCmpBarrier::verify() will immediately complain when it does
// not find the expected native instruction at this offset, which needs updating.
// Note that this offset is invariant of PreserveFramePointer.
static const int entry_barrier_offset = LP64_ONLY(-19) NOT_LP64(-18);
// Returns the (negative) byte offset that is added to the nmethod's frame-complete
// position to locate the entry barrier's cmp instruction.
//
// On 64-bit builds the offset depends on the compiler that produced the nmethod:
// -14 for C2-compiled code and -15 for everything else; 32-bit builds use -18.
// NOTE(review): presumably the two 64-bit values differ because C2 follows the 8-byte
// cmp with a single jump to an out-of-line stub, while other code keeps a short jump
// plus an inline call - confirm against the emitted instruction lengths.
//
// The return type is a plain int: a top-level const on a by-value return has no
// effect and only provokes ignored-qualifier warnings.
static int entry_barrier_offset(nmethod* nm) {
#ifdef _LP64
  if (nm->is_compiled_by_c2()) {
    return -14;
  } else {
    return -15;
  }
#else
  return -18;
#endif
}

static NativeNMethodCmpBarrier* native_nmethod_barrier(nmethod* nm) {
address barrier_address = nm->code_begin() + nm->frame_complete_offset() + entry_barrier_offset;
address barrier_address = nm->code_begin() + nm->frame_complete_offset() + entry_barrier_offset(nm);
NativeNMethodCmpBarrier* barrier = reinterpret_cast<NativeNMethodCmpBarrier*>(barrier_address);
debug_only(barrier->verify());
return barrier;
Expand Down
2 changes: 1 addition & 1 deletion src/hotspot/cpu/x86/sharedRuntime_x86_32.cpp
Expand Up @@ -1518,7 +1518,7 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,


BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->nmethod_entry_barrier(masm);
bs->nmethod_entry_barrier(masm, NULL /* slow_path */, NULL /* continuation */);

// Frame is now completed as far as size and linkage.
int frame_complete = ((intptr_t)__ pc()) - start;
Expand Down
3 changes: 2 additions & 1 deletion src/hotspot/cpu/x86/sharedRuntime_x86_64.cpp
Expand Up @@ -1744,7 +1744,8 @@ nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
__ subptr(rsp, stack_size - 2*wordSize);

BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->nmethod_entry_barrier(masm);
// native wrapper is not hot enough to micro optimize the nmethod entry barrier with an out-of-line stub
bs->nmethod_entry_barrier(masm, NULL /* slow_path */, NULL /* continuation */);

// Frame is now completed as far as size and linkage.
int frame_complete = ((intptr_t)__ pc()) - start;
Expand Down
2 changes: 2 additions & 0 deletions src/hotspot/share/opto/c2_MacroAssembler.hpp
Expand Up @@ -29,6 +29,8 @@
#include "asm/macroAssembler.inline.hpp"
#include "utilities/macros.hpp"

class C2EntryBarrierStub;

class C2_MacroAssembler: public MacroAssembler {
public:
// creation
Expand Down
45 changes: 45 additions & 0 deletions src/hotspot/share/opto/output.cpp
Expand Up @@ -39,6 +39,7 @@
#include "opto/ad.hpp"
#include "opto/block.hpp"
#include "opto/c2compiler.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/callnode.hpp"
#include "opto/cfgnode.hpp"
#include "opto/locknode.hpp"
Expand Down Expand Up @@ -284,12 +285,51 @@ int C2SafepointPollStubTable::estimate_stub_size() const {
return result;
}

// Nmethod entry barrier stubs
// Allocates the single entry barrier stub of this table in the current
// compilation's arena, remembers it, and hands it back to the caller.
// Asserts that no stub has been registered before.
C2EntryBarrierStub* C2EntryBarrierStubTable::add_entry_barrier() {
  assert(_stub == NULL, "There can only be one entry barrier stub");
  C2EntryBarrierStub* const new_stub = new (Compile::current()->comp_arena()) C2EntryBarrierStub();
  _stub = new_stub;
  return new_stub;
}

// Emits the registered nmethod entry barrier stub (if any) into the given code
// buffer and checks that the emitted size matches the advertised stub size.
void C2EntryBarrierStubTable::emit(CodeBuffer& cb) {
  // No stub was registered, so there is nothing to emit.
  if (_stub == NULL) {
    return;
  }

  C2_MacroAssembler masm(&cb);

  // Make sure the instruction section has enough room; if the buffer ends up
  // without a blob after an expansion, record the failure and give up.
  const bool expanded = cb.insts()->maybe_expand_to_ensure_remaining(PhaseOutput::MAX_inst_size);
  if (expanded && cb.blob() == NULL) {
    ciEnv::current()->record_failure("CodeCache is full");
    return;
  }

  const intptr_t start_offset = masm.offset();
  masm.emit_entry_barrier_stub(_stub);
  const intptr_t end_offset = masm.offset();

  // The number of bytes actually emitted must equal entry_barrier_stub_size().
  const int expected_size = C2_MacroAssembler::entry_barrier_stub_size();
  const int actual_size = (int)(end_offset - start_offset);
  assert(actual_size == expected_size, "Estimated size is wrong, expected %d, was %d", expected_size, actual_size);
}

// Returns the number of bytes to reserve for the entry barrier stub: the
// platform's stub size when the barrier set provides nmethod entry barriers,
// and 0 when it does not.
int C2EntryBarrierStubTable::estimate_stub_size() const {
  const bool has_nmethod_barrier = BarrierSet::barrier_set()->barrier_set_nmethod() != NULL;
  return has_nmethod_barrier ? C2_MacroAssembler::entry_barrier_stub_size() : 0;
}

PhaseOutput::PhaseOutput()
: Phase(Phase::Output),
_code_buffer("Compile::Fill_buffer"),
_first_block_size(0),
_handler_table(),
_inc_table(),
_safepoint_poll_table(),
_entry_barrier_table(),
_oop_map_set(NULL),
_scratch_buffer_blob(NULL),
_scratch_locs_memory(NULL),
Expand Down Expand Up @@ -1302,6 +1342,7 @@ CodeBuffer* PhaseOutput::init_buffer() {
BarrierSetC2* bs = BarrierSet::barrier_set()->barrier_set_c2();
stub_req += bs->estimate_stub_size();
stub_req += safepoint_poll_table()->estimate_stub_size();
stub_req += entry_barrier_table()->estimate_stub_size();

// nmethod and CodeBuffer count stubs & constants as part of method's code.
// class HandlerImpl is platform-specific and defined in the *.ad files.
Expand Down Expand Up @@ -1812,6 +1853,10 @@ void PhaseOutput::fill_buffer(CodeBuffer* cb, uint* blk_starts) {
safepoint_poll_table()->emit(*cb);
if (C->failing()) return;

// Fill in stubs for calling the runtime from nmethod entries.
entry_barrier_table()->emit(*cb);
if (C->failing()) return;

#ifndef PRODUCT
// Information on the size of the method, without the extraneous code
Scheduling::increment_method_size(cb->insts_size());
Expand Down
29 changes: 29 additions & 0 deletions src/hotspot/share/opto/output.hpp
Expand Up @@ -40,6 +40,7 @@ class Arena;
class Bundle;
class Block;
class Block_Array;
class C2_MacroAssembler;
class ciMethod;
class Compile;
class MachNode;
Expand Down Expand Up @@ -113,6 +114,30 @@ class C2SafepointPollStubTable {
void emit(CodeBuffer& cb);
};

// We move non-hot code of the nmethod entry barrier to an out-of-line stub
// Holds the two labels that connect the inline nmethod entry barrier check
// with its out-of-line stub code.
class C2EntryBarrierStub: public ResourceObj {
// Label of the out-of-line slow path.
Label _slow_path;
// Label the slow path jumps to once it is done.
Label _continuation;

public:
// Both labels start out default-constructed.
C2EntryBarrierStub() :
_slow_path(),
_continuation() {}

// Accessors return mutable references so that callers can bind and jump to the labels.
Label& slow_path() { return _slow_path; }
Label& continuation() { return _continuation; }
};

// Table for the nmethod entry barrier stub of a compilation. It holds at most
// one stub; _stub stays NULL until add_entry_barrier() is called.
class C2EntryBarrierStubTable {
// The single registered stub, or NULL if none has been added.
C2EntryBarrierStub* _stub;

public:
C2EntryBarrierStubTable() : _stub(NULL) {}
// Creates and registers the stub; returns the new stub.
C2EntryBarrierStub* add_entry_barrier();
// Number of bytes to reserve for the stub code.
int estimate_stub_size() const;
// Emits the registered stub, if any, into the given code buffer.
void emit(CodeBuffer& cb);
};

class PhaseOutput : public Phase {
private:
// Instruction bits passed off to the VM
Expand All @@ -122,6 +147,7 @@ class PhaseOutput : public Phase {
ExceptionHandlerTable _handler_table; // Table of native-code exception handlers
ImplicitExceptionTable _inc_table; // Table of implicit null checks in native code
C2SafepointPollStubTable _safepoint_poll_table;// Table for safepoint polls
C2EntryBarrierStubTable _entry_barrier_table; // Table for entry barrier stubs
OopMapSet* _oop_map_set; // Table of oop maps (one for each safepoint location)
BufferBlob* _scratch_buffer_blob; // For temporary code buffers.
relocInfo* _scratch_locs_memory; // For temporary code buffers.
Expand Down Expand Up @@ -172,6 +198,9 @@ class PhaseOutput : public Phase {
// Safepoint poll table
C2SafepointPollStubTable* safepoint_poll_table() { return &_safepoint_poll_table; }

// Entry barrier table: returns a pointer to this PhaseOutput's table of
// nmethod entry barrier stubs.
C2EntryBarrierStubTable* entry_barrier_table() { return &_entry_barrier_table; }

// Code emission iterator
Block* block() { return _block; }
int index() { return _index; }
Expand Down

1 comment on commit b28f9da

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.