23 changes: 8 additions & 15 deletions src/hotspot/cpu/aarch64/aarch64.ad
@@ -2205,14 +2205,14 @@ void MachUEPNode::format(PhaseRegAlloc* ra_, outputStream* st) const
{
st->print_cr("# MachUEPNode");
if (UseCompressedClassPointers) {
st->print_cr("\tldrw rscratch1, j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
if (CompressedKlassPointers::shift() != 0) {
st->print_cr("\tdecode_klass_not_null rscratch1, rscratch1");
}
st->print_cr("\tldrw rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
st->print_cr("\tldrw r10, [rscratch2 + CompiledICData::speculated_klass_offset()]\t# compressed klass");
st->print_cr("\tcmpw rscratch1, r10");
} else {
st->print_cr("\tldr rscratch1, j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
st->print_cr("\tldr rscratch1, [j_rarg0 + oopDesc::klass_offset_in_bytes()]\t# compressed klass");
st->print_cr("\tldr r10, [rscratch2 + CompiledICData::speculated_klass_offset()]\t# compressed klass");
st->print_cr("\tcmp rscratch1, r10");
}
st->print_cr("\tcmp r0, rscratch1\t # Inline cache check");
st->print_cr("\tbne, SharedRuntime::_ic_miss_stub");
}
#endif
@@ -2221,14 +2221,7 @@ void MachUEPNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const
{
// This is the unverified entry point.
C2_MacroAssembler _masm(&cbuf);

__ cmp_klass(j_rarg0, rscratch2, rscratch1);
Label skip;
// TODO
// can we avoid this skip and still use a reloc?
__ br(Assembler::EQ, skip);
__ far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
__ bind(skip);
__ ic_check(InteriorEntryAlignment);
}

uint MachUEPNode::size(PhaseRegAlloc* ra_) const
@@ -3715,7 +3708,7 @@ encode %{
cbuf.shared_stub_to_interp_for(_method, call - cbuf.insts_begin());
} else {
// Emit stub for static call
address stub = CompiledStaticCall::emit_to_interp_stub(cbuf, call);
address stub = CompiledDirectCall::emit_to_interp_stub(cbuf, call);
if (stub == nullptr) {
ciEnv::current()->record_failure("CodeCache is full");
return;
25 changes: 2 additions & 23 deletions src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp
@@ -53,7 +53,6 @@
#endif

NEEDS_CLEANUP // remove this definitions ?
const Register IC_Klass = rscratch2; // where the IC klass is cached
const Register SYNC_header = r0; // synchronization header
const Register SHIFT_count = r0; // where count for shift operations must be

@@ -293,27 +292,7 @@ void LIR_Assembler::osr_entry() {

// inline cache check; done before the frame is built.
int LIR_Assembler::check_icache() {
Register receiver = FrameMap::receiver_opr->as_register();
Register ic_klass = IC_Klass;
int start_offset = __ offset();
__ inline_cache_check(receiver, ic_klass);

// if icache check fails, then jump to runtime routine
// Note: RECEIVER must still contain the receiver!
Label dont;
__ br(Assembler::EQ, dont);
__ far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

// We align the verified entry point unless the method body
// (including its inline cache check) will fit in a single 64-byte
// icache line.
if (! method()->is_accessor() || __ offset() - start_offset > 4 * 4) {
// force alignment after the cache check.
__ align(CodeEntryAlignment);
}

__ bind(dont);
return start_offset;
return __ ic_check(CodeEntryAlignment);
Member:
Do we really want to remove the optimization that skips alignment for small methods?

Contributor Author:
It's not obvious to me how likely this optimization is to kick in, and if it does, how many bytes it really saves. It optimizes accessor methods, where the UEP can be squeezed down to 4 instructions. The previous inline_cache_check was >= 2 instructions, there is a jump to skip the far jump, and the far_jump is >= 1 instruction. So it seems like it would only kick in when far jumps can be plain branches and the compressed class encoding is simple enough. However, with the new ic_check we always have at least 5 instructions, sometimes 7. So it seemed to me like the intended optimization wouldn't apply any longer anyway.

}

void LIR_Assembler::clinit_barrier(ciMethod* method) {
@@ -2042,7 +2021,7 @@ void LIR_Assembler::emit_static_call_stub() {
__ relocate(static_stub_Relocation::spec(call_pc));
__ emit_static_call_stub();

assert(__ offset() - start + CompiledStaticCall::to_trampoline_stub_size()
assert(__ offset() - start + CompiledDirectCall::to_trampoline_stub_size()
<= call_stub_size(), "stub too big");
__ end_a_stub();
}
4 changes: 2 additions & 2 deletions src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.hpp
@@ -71,8 +71,8 @@ friend class ArrayCopyStub
void deoptimize_trap(CodeEmitInfo *info);

enum {
// call stub: CompiledStaticCall::to_interp_stub_size() +
// CompiledStaticCall::to_trampoline_stub_size()
// call stub: CompiledDirectCall::to_interp_stub_size() +
// CompiledDirectCall::to_trampoline_stub_size()
_call_stub_size = 13 * NativeInstruction::instruction_size,
_exception_handler_size = DEBUG_ONLY(1*K) NOT_DEBUG(175),
_deopt_handler_size = 7 * NativeInstruction::instruction_size
11 changes: 0 additions & 11 deletions src/hotspot/cpu/aarch64/c1_MacroAssembler_aarch64.cpp
@@ -308,17 +308,6 @@ void C1_MacroAssembler::allocate_array(Register obj, Register len, Register t1,
verify_oop(obj);
}


void C1_MacroAssembler::inline_cache_check(Register receiver, Register iCache) {
verify_oop(receiver);
// explicit null check not needed since load from [klass_offset] causes a trap
// check against inline cache
assert(!MacroAssembler::needs_explicit_null_check(oopDesc::klass_offset_in_bytes()), "must add explicit null check");

cmp_klass(receiver, iCache, rscratch1);
}


void C1_MacroAssembler::build_frame(int framesize, int bang_size_in_bytes) {
assert(bang_size_in_bytes >= framesize, "stack bang size incorrect");
// Make sure there is enough stack space for this method's activation.
1 change: 0 additions & 1 deletion src/hotspot/cpu/aarch64/c1_Runtime1_aarch64.cpp
@@ -38,7 +38,6 @@
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/compiledICHolder.hpp"
#include "oops/oop.inline.hpp"
#include "prims/jvmtiExport.hpp"
#include "register_aarch64.hpp"
22 changes: 7 additions & 15 deletions src/hotspot/cpu/aarch64/compiledIC_aarch64.cpp
@@ -26,7 +26,6 @@
#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "code/icBuffer.hpp"
#include "code/nmethod.hpp"
#include "logging/log.hpp"
#include "memory/resourceArea.hpp"
@@ -36,7 +35,7 @@
// ----------------------------------------------------------------------------

#define __ _masm.
address CompiledStaticCall::emit_to_interp_stub(CodeBuffer &cbuf, address mark) {
address CompiledDirectCall::emit_to_interp_stub(CodeBuffer &cbuf, address mark) {
precond(cbuf.stubs()->start() != badAddress);
precond(cbuf.stubs()->end() != badAddress);

@@ -71,33 +70,26 @@ address CompiledStaticCall::emit_to_interp_stub(CodeBuffer &cbuf, address mark)
}
#undef __

int CompiledStaticCall::to_interp_stub_size() {
int CompiledDirectCall::to_interp_stub_size() {
return MacroAssembler::static_call_stub_size();
}

int CompiledStaticCall::to_trampoline_stub_size() {
int CompiledDirectCall::to_trampoline_stub_size() {
// Somewhat pessimistically, we count 3 instructions here (although
// there are only two) because we sometimes emit an alignment nop.
// Trampoline stubs are always word aligned.
return MacroAssembler::max_trampoline_stub_size();
}

// Relocation entries for call stub, compiled java to interpreter.
int CompiledStaticCall::reloc_to_interp_stub() {
int CompiledDirectCall::reloc_to_interp_stub() {
return 4; // 3 in emit_to_interp_stub + 1 in emit_call
}

void CompiledDirectStaticCall::set_to_interpreted(const methodHandle& callee, address entry) {
void CompiledDirectCall::set_to_interpreted(const methodHandle& callee, address entry) {
address stub = find_stub();
guarantee(stub != nullptr, "stub not found");

{
ResourceMark rm;
log_trace(inlinecache)("CompiledDirectStaticCall@" INTPTR_FORMAT ": set_to_interpreted %s",
p2i(instruction_address()),
callee->name_and_sig_as_C_string());
}

// Creation also verifies the object.
NativeMovConstReg* method_holder
= nativeMovConstReg_at(stub + NativeInstruction::instruction_size);
@@ -115,7 +107,7 @@ void CompiledDirectStaticCall::set_to_interpreted(const methodHandle& callee, ad
set_destination_mt_safe(stub);
}

void CompiledDirectStaticCall::set_stub_to_clean(static_stub_Relocation* static_stub) {
void CompiledDirectCall::set_stub_to_clean(static_stub_Relocation* static_stub) {
// Reset stub.
address stub = static_stub->addr();
assert(stub != nullptr, "stub not found");
@@ -132,7 +124,7 @@ void CompiledDirectStaticCall::set_stub_to_clean(static_stub_Relocation* static_
// Non-product mode code
#ifndef PRODUCT

void CompiledDirectStaticCall::verify() {
void CompiledDirectCall::verify() {
// Verify call.
_call->verify();
_call->verify_alignment();
82 changes: 0 additions & 82 deletions src/hotspot/cpu/aarch64/icBuffer_aarch64.cpp

This file was deleted.

57 changes: 53 additions & 4 deletions src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
@@ -29,6 +29,7 @@
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "ci/ciEnv.hpp"
#include "code/compiledIC.hpp"
#include "compiler/compileTask.hpp"
#include "compiler/disassembler.hpp"
#include "compiler/oopMap.hpp"
@@ -965,7 +966,7 @@ int MacroAssembler::max_trampoline_stub_size() {
}

void MacroAssembler::emit_static_call_stub() {
// CompiledDirectStaticCall::set_to_interpreted knows the
// CompiledDirectCall::set_to_interpreted knows the
// exact layout of this stub.

isb();
@@ -995,10 +996,51 @@ address MacroAssembler::ic_call(address entry, jint method_index) {
// address const_ptr = long_constant((jlong)Universe::non_oop_word());
// uintptr_t offset;
// ldr_constant(rscratch2, const_ptr);
movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
movptr(rscratch2, (intptr_t)Universe::non_oop_word());
return trampoline_call(Address(entry, rh));
}

int MacroAssembler::ic_check_size() {
if (target_needs_far_branch(CAST_FROM_FN_PTR(address, SharedRuntime::get_ic_miss_stub()))) {
return NativeInstruction::instruction_size * 7;
} else {
return NativeInstruction::instruction_size * 5;
}
}

int MacroAssembler::ic_check(int end_alignment) {
Register receiver = j_rarg0;
Register data = rscratch2;
Register tmp1 = rscratch1;
Register tmp2 = r10;

Member:
It would be nice if we could still call verify_oop(receiver) here, but I see that would complicate ic_check_size().

Contributor Author:
Yeah I thought the same. As it's important for correctness that ic_check_size is accurate, I was hoping to have as few different modes in it as possible.

// The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
// before the inline cache check, so we don't have to execute any nop instructions when dispatching
// through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
// before the inline cache check here, and not after
align(end_alignment, offset() + ic_check_size());

int uep_offset = offset();

if (UseCompressedClassPointers) {
ldrw(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
ldrw(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
cmpw(tmp1, tmp2);
} else {
ldr(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
ldr(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
cmp(tmp1, tmp2);
}

Label dont;
br(Assembler::EQ, dont);
far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
bind(dont);
assert((offset() % end_alignment) == 0, "Misaligned verified entry point");

return uep_offset;
}
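
To make the pre-alignment idea concrete, here is a minimal standalone sketch of what align(end_alignment, offset() + ic_check_size()) achieves: padding is emitted before the inline cache check so that the verified entry point immediately after it lands on the alignment boundary. The names and sizes (buf_offset, emit_nop, the 64-byte alignment, the 5-instruction check) are invented for illustration; this is not the HotSpot MacroAssembler API.

```cpp
// Standalone model of the pre-alignment done by ic_check (illustrative names
// and sizes only; this is not the HotSpot MacroAssembler API).
#include <cassert>
#include <cstdio>

namespace {

int buf_offset = 0;              // stands in for MacroAssembler::offset()
constexpr int kInsnSize = 4;     // AArch64 instructions are 4 bytes

void emit_nop() { buf_offset += kInsnSize; }

// Same shape as the new two-argument align(): pad with nops so that the
// position 'target' (an absolute code offset) ends up aligned to 'modulus'
// once the padding has shifted it.
void align_to(int modulus, int target) {
  int delta = target - buf_offset;
  while ((buf_offset + delta) % modulus != 0) emit_nop();
}

}  // namespace

int main() {
  constexpr int kCodeEntryAlignment = 64;      // assumed VEP alignment
  constexpr int kIcCheckSize = 5 * kInsnSize;  // near-branch case of ic_check_size()

  buf_offset = 12;  // pretend some code precedes the UEP

  // Pad *before* the inline cache check: the nops are never executed when
  // dispatching through the UEP, yet the VEP right after the check is aligned.
  align_to(kCodeEntryAlignment, buf_offset + kIcCheckSize);

  int uep_offset = buf_offset;
  buf_offset += kIcCheckSize;  // "emit" the inline cache check itself

  assert(buf_offset % kCodeEntryAlignment == 0 && "misaligned verified entry point");
  std::printf("UEP at %d, VEP at %d\n", uep_offset, buf_offset);
  return 0;
}
```

With these assumed numbers the check starts at offset 44 and the verified entry point falls exactly on the 64-byte boundary at offset 64, which is also why ic_check_size() has to predict the emitted size exactly.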

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
@@ -1100,7 +1142,14 @@ void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thr
}

void MacroAssembler::align(int modulus) {
while (offset() % modulus != 0) nop();
align(modulus, offset());
}

// Ensure that the code at target bytes offset from the current offset() is aligned
// according to modulus.
void MacroAssembler::align(int modulus, int target) {
Member:
It would be nice to document what this extra align function does.

Contributor Author:
Good idea. I wrote a comment.

int delta = target - offset();
while ((offset() + delta) % modulus != 0) nop();
}

void MacroAssembler::post_call_nop() {
@@ -1197,7 +1246,7 @@ void MacroAssembler::lookup_interface_method(Register recv_klass,
}

// Look up the method for a megamorphic invokeinterface call in a single pass over itable:
// - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICHolder
// - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
// - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
// The target method is determined by <holder_klass, itable_index>.
// The receiver klass is in recv_klass.
3 changes: 3 additions & 0 deletions src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
@@ -720,6 +720,7 @@ class MacroAssembler: public Assembler {

// Alignment
void align(int modulus);
void align(int modulus, int target);

// nop
void post_call_nop();
@@ -1247,6 +1248,8 @@ class MacroAssembler: public Assembler {

// Emit the CompiledIC call idiom
address ic_call(address entry, jint method_index = 0);
static int ic_check_size();
int ic_check(int end_alignment);

public:
