Skip to content

Commit

Permalink
8260355: AArch64: deoptimization stub should save vector registers
Browse files Browse the repository at this point in the history
Reviewed-by: vlivanov, aph
  • Loading branch information
nick-arm committed Feb 9, 2021
1 parent 5d8204b commit 5183d8a
Show file tree
Hide file tree
Showing 12 changed files with 182 additions and 75 deletions.
48 changes: 48 additions & 0 deletions src/hotspot/cpu/aarch64/registerMap_aarch64.cpp
@@ -0,0 +1,48 @@
/*
* Copyright (c) 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2021, Arm Limited. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

#include "precompiled.hpp"
#include "runtime/registerMap.hpp"
#include "vmreg_aarch64.inline.hpp"

// Platform-dependent lookup of the saved location of one slot of a register.
//
//   base_reg  the register whose save location is wanted; for a float/vector
//             register this must be the concrete (base) VMReg.
//   slot_idx  index of the stack-slot-sized piece of the register to locate.
//
// Returns the address of the requested slot, or NULL when location() has no
// recorded address for the register.
address RegisterMap::pd_location(VMReg base_reg, int slot_idx) const {
  if (!base_reg->is_FloatRegister()) {
    // Every slot of a non-float register has its own VMReg, so step to the
    // VMReg for that slot and look it up directly.
    return location(base_reg->next(slot_idx));
  }

  // Not all physical slots of an SVE register have corresponding
  // VMRegs. However they are always saved to the stack in a
  // contiguous region of memory so we can calculate the address of
  // the upper slots by offsetting from the base address.
  assert(base_reg->is_concrete(), "must pass base reg");
  intptr_t offset_in_bytes = slot_idx * VMRegImpl::stack_slot_size;
  address base_location = location(base_reg);
  if (base_location == NULL) {
    // The base register was not saved, so none of its slots has a location.
    return NULL;
  }
  return base_location + offset_in_bytes;
}
6 changes: 3 additions & 3 deletions src/hotspot/cpu/aarch64/registerMap_aarch64.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 1998, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1998, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
Expand Down Expand Up @@ -32,8 +32,8 @@
private:
// This is the hook for finding a register in a "well-known" location,
// such as a register block of a predetermined format.
// Since there is none, we just return NULL.
address pd_location(VMReg reg) const {return NULL;}
address pd_location(VMReg reg) const { return NULL; }
address pd_location(VMReg base_reg, int slot_idx) const;

// no PD state to clear or copy:
void pd_clear() {}
Expand Down
130 changes: 72 additions & 58 deletions src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp
Expand Up @@ -85,26 +85,24 @@ class SimpleRuntimeFrame {

// FIXME -- this is used by C1
class RegisterSaver {
const bool _save_vectors;
public:
static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
RegisterSaver(bool save_vectors) : _save_vectors(save_vectors) {}

OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words);
void restore_live_registers(MacroAssembler* masm);

// Offsets into the register save area
// Used by deoptimization when it is managing result register
// values on its own

static int r0_offset_in_bytes(void) { return (32 + r0->encoding()) * wordSize; }
static int reg_offset_in_bytes(Register r) { return r0_offset_in_bytes() + r->encoding() * wordSize; }
static int rmethod_offset_in_bytes(void) { return reg_offset_in_bytes(rmethod); }
static int rscratch1_offset_in_bytes(void) { return (32 + rscratch1->encoding()) * wordSize; }
static int v0_offset_in_bytes(void) { return 0; }
static int return_offset_in_bytes(void) { return (32 /* floats*/ + 31 /* gregs*/) * wordSize; }

// During deoptimization only the result registers need to be restored,
// all the other values have already been extracted.
static void restore_result_registers(MacroAssembler* masm);
int reg_offset_in_bytes(Register r);
int r0_offset_in_bytes() { return reg_offset_in_bytes(r0); }
int rscratch1_offset_in_bytes() { return reg_offset_in_bytes(rscratch1); }
int v0_offset_in_bytes(void) { return 0; }

// Capture info about frame layout
// Capture info about frame layout
// Note this is only correct when not saving full vectors.
enum layout {
fpu_state_off = 0,
fpu_state_end = fpu_state_off + FPUStateSizeInWords - 1,
Expand All @@ -119,7 +117,31 @@ class RegisterSaver {

};

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
// Byte offset, from the base of the register save area, at which
// general-purpose register r is stored.
int RegisterSaver::reg_offset_in_bytes(Register r) {
  // save_live_registers() pushes the integer registers above the
  // floating point registers, so the first integer register sits just
  // past the whole FP/vector area. The size of that area depends on
  // whether full vectors are saved and, if so, whether they are NEON
  // or SVE registers.
  int slots_per_fp_reg = FloatRegisterImpl::save_slots_per_register;

#if COMPILER2_OR_JVMCI
  if (_save_vectors) {
#ifdef COMPILER2
    slots_per_fp_reg = Matcher::supports_scalable_vector()
                         ? Matcher::scalable_vector_reg_size(T_FLOAT)
                         : FloatRegisterImpl::slots_per_neon_register;
#else
    slots_per_fp_reg = FloatRegisterImpl::slots_per_neon_register;
#endif
  }
#endif

  // Each slot is BytesPerInt wide; each integer register takes one word.
  const int fp_area_in_bytes =
      slots_per_fp_reg * FloatRegisterImpl::number_of_registers * BytesPerInt;
  return fp_area_in_bytes + r->encoding() * wordSize;
}

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words) {
bool use_sve = false;
int sve_vector_size_in_bytes = 0;
int sve_vector_size_in_slots = 0;
Expand All @@ -131,7 +153,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
#endif

#if COMPILER2_OR_JVMCI
if (save_vectors) {
if (_save_vectors) {
int vect_words = 0;
int extra_save_slots_per_register = 0;
// Save upper half of vector registers
Expand All @@ -145,7 +167,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
additional_frame_words += vect_words;
}
#else
assert(!save_vectors, "vectors are generated only by C2 and JVMCI");
assert(!_save_vectors, "vectors are generated only by C2 and JVMCI");
#endif

int frame_size_in_bytes = align_up(additional_frame_words * wordSize +
Expand All @@ -160,7 +182,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_

// Save Integer and Float registers.
__ enter();
__ push_CPU_state(save_vectors, use_sve, sve_vector_size_in_bytes);
__ push_CPU_state(_save_vectors, use_sve, sve_vector_size_in_bytes);

// Set an oopmap for the call site. This oopmap will map all
// oop-registers and debug-info registers as callee-saved. This
Expand All @@ -185,7 +207,7 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
for (int i = 0; i < FloatRegisterImpl::number_of_registers; i++) {
FloatRegister r = as_FloatRegister(i);
int sp_offset = 0;
if (save_vectors) {
if (_save_vectors) {
sp_offset = use_sve ? (sve_vector_size_in_slots * i) :
(FloatRegisterImpl::slots_per_neon_register * i);
} else {
Expand All @@ -198,37 +220,20 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
return oop_map;
}

void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
void RegisterSaver::restore_live_registers(MacroAssembler* masm) {
#ifdef COMPILER2
__ pop_CPU_state(restore_vectors, Matcher::supports_scalable_vector(),
__ pop_CPU_state(_save_vectors, Matcher::supports_scalable_vector(),
Matcher::scalable_vector_reg_size(T_BYTE));
#else
#if !INCLUDE_JVMCI
assert(!restore_vectors, "vectors are generated only by C2 and JVMCI");
assert(!_save_vectors, "vectors are generated only by C2 and JVMCI");
#endif
__ pop_CPU_state(restore_vectors);
__ pop_CPU_state(_save_vectors);
#endif
__ leave();

}

// Emits code that reloads only the result registers (v0 and r0) from the
// register save area addressed off sp, then releases that area by adding
// its 16-byte-aligned size to sp.
void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

// Just restore the result registers. Only used by deoptimization. By
// now any callee save register that needs to be restored to a c2
// caller of the deoptee has been extracted into the vframeArray
// and will be stuffed into the c2i adapter we create for later
// restoration so only result registers need to be restored here.

// Restore fp result register
__ ldrd(v0, Address(sp, v0_offset_in_bytes()));
// Restore integer result register
__ ldr(r0, Address(sp, r0_offset_in_bytes()));

// Pop all of the register save area off the stack
// (return_offset_in_bytes() is rounded up to a multiple of 16 before
// being added to sp).
__ add(sp, sp, align_up(return_offset_in_bytes(), 16));
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 8 bytes vector registers are saved by default on AArch64.
bool SharedRuntime::is_wide_vector(int size) {
Expand Down Expand Up @@ -2164,6 +2169,7 @@ void SharedRuntime::generate_deopt_blob() {
int frame_size_in_words;
OopMap* map = NULL;
OopMapSet *oop_maps = new OopMapSet();
RegisterSaver reg_save(COMPILER2_OR_JVMCI != 0);

// -------------
// This code enters when returning to a de-optimized nmethod. A return
Expand Down Expand Up @@ -2201,7 +2207,7 @@ void SharedRuntime::generate_deopt_blob() {
// Prolog for non exception case!

// Save everything in sight.
map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
map = reg_save.save_live_registers(masm, 0, &frame_size_in_words);

// Normal deoptimization. Save exec mode for unpack_frames.
__ movw(rcpool, Deoptimization::Unpack_deopt); // callee-saved
Expand All @@ -2219,7 +2225,7 @@ void SharedRuntime::generate_deopt_blob() {
// return address is the pc that describes which bci to re-execute at

// No need to update map as each call to save_live_registers will produce identical oopmap
(void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
(void) reg_save.save_live_registers(masm, 0, &frame_size_in_words);

__ movw(rcpool, Deoptimization::Unpack_reexecute); // callee-saved
__ b(cont);
Expand All @@ -2238,7 +2244,7 @@ void SharedRuntime::generate_deopt_blob() {
uncommon_trap_offset = __ pc() - start;

// Save everything in sight.
RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
reg_save.save_live_registers(masm, 0, &frame_size_in_words);
// fetch_unroll_info needs to call last_java_frame()
Label retaddr;
__ set_last_Java_frame(sp, noreg, retaddr, rscratch1);
Expand Down Expand Up @@ -2295,7 +2301,7 @@ void SharedRuntime::generate_deopt_blob() {
// This is a somewhat fragile mechanism.

// Save everything in sight.
map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
map = reg_save.save_live_registers(masm, 0, &frame_size_in_words);

// Now it is safe to overwrite any register

Expand Down Expand Up @@ -2376,7 +2382,7 @@ void SharedRuntime::generate_deopt_blob() {
__ verify_oop(r0);

// Overwrite the result registers with the exception results.
__ str(r0, Address(sp, RegisterSaver::r0_offset_in_bytes()));
__ str(r0, Address(sp, reg_save.r0_offset_in_bytes()));
// I think this is useless
// __ str(r3, Address(sp, RegisterSaver::r3_offset_in_bytes()));

Expand All @@ -2385,7 +2391,14 @@ void SharedRuntime::generate_deopt_blob() {
// Only register save data is on the stack.
// Now restore the result registers. Everything else is either dead
// or captured in the vframeArray.
RegisterSaver::restore_result_registers(masm);

// Restore fp result register
__ ldrd(v0, Address(sp, reg_save.v0_offset_in_bytes()));
// Restore integer result register
__ ldr(r0, Address(sp, reg_save.r0_offset_in_bytes()));

// Pop all of the register save area off the stack
__ add(sp, sp, frame_size_in_words * wordSize);

// All of the register save area has been popped off the stack. Only the
// return address remains.
Expand Down Expand Up @@ -2466,8 +2479,8 @@ void SharedRuntime::generate_deopt_blob() {
__ sub(sp, sp, (frame_size_in_words - 2) * wordSize);

// Restore frame locals after moving the frame
__ strd(v0, Address(sp, RegisterSaver::v0_offset_in_bytes()));
__ str(r0, Address(sp, RegisterSaver::r0_offset_in_bytes()));
__ strd(v0, Address(sp, reg_save.v0_offset_in_bytes()));
__ str(r0, Address(sp, reg_save.r0_offset_in_bytes()));

// Call C code. Need thread but NOT official VM entry
// crud. We cannot block on this call, no GC can happen. Call should
Expand All @@ -2494,8 +2507,8 @@ void SharedRuntime::generate_deopt_blob() {
__ reset_last_Java_frame(true);

// Collect return values
__ ldrd(v0, Address(sp, RegisterSaver::v0_offset_in_bytes()));
__ ldr(r0, Address(sp, RegisterSaver::r0_offset_in_bytes()));
__ ldrd(v0, Address(sp, reg_save.v0_offset_in_bytes()));
__ ldr(r0, Address(sp, reg_save.r0_offset_in_bytes()));
// I think this is useless (throwing pc?)
// __ ldr(r3, Address(sp, RegisterSaver::r3_offset_in_bytes()));

Expand Down Expand Up @@ -2741,10 +2754,10 @@ SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_t
address call_pc = NULL;
int frame_size_in_words;
bool cause_return = (poll_type == POLL_AT_RETURN);
bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
RegisterSaver reg_save(poll_type == POLL_AT_VECTOR_LOOP /* save_vectors */);

// Save Integer and Float registers.
map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
map = reg_save.save_live_registers(masm, 0, &frame_size_in_words);

// The following is basically a call_VM. However, we need the precise
// address of the call in order to generate an oopmap. Hence, we do all the
Expand Down Expand Up @@ -2789,7 +2802,7 @@ SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_t

// Exception pending

RegisterSaver::restore_live_registers(masm, save_vectors);
reg_save.restore_live_registers(masm);

__ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

Expand Down Expand Up @@ -2821,7 +2834,7 @@ SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_t

__ bind(no_adjust);
// Normal exit, restore registers and exit.
RegisterSaver::restore_live_registers(masm, save_vectors);
reg_save.restore_live_registers(masm);

__ ret(lr);

Expand Down Expand Up @@ -2855,13 +2868,14 @@ RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const cha
MacroAssembler* masm = new MacroAssembler(&buffer);

int frame_size_in_words;
RegisterSaver reg_save(false /* save_vectors */);

OopMapSet *oop_maps = new OopMapSet();
OopMap* map = NULL;

int start = __ offset();

map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
map = reg_save.save_live_registers(masm, 0, &frame_size_in_words);

int frame_complete = __ offset();

Expand Down Expand Up @@ -2893,11 +2907,11 @@ RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const cha

// get the returned Method*
__ get_vm_result_2(rmethod, rthread);
__ str(rmethod, Address(sp, RegisterSaver::reg_offset_in_bytes(rmethod)));
__ str(rmethod, Address(sp, reg_save.reg_offset_in_bytes(rmethod)));

// r0 is where we want to jump, overwrite rscratch1 which is saved and scratch
__ str(r0, Address(sp, RegisterSaver::rscratch1_offset_in_bytes()));
RegisterSaver::restore_live_registers(masm);
__ str(r0, Address(sp, reg_save.rscratch1_offset_in_bytes()));
reg_save.restore_live_registers(masm);

// We are back to the original state on entry and ready to go.

Expand All @@ -2907,7 +2921,7 @@ RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const cha

__ bind(pending);

RegisterSaver::restore_live_registers(masm);
reg_save.restore_live_registers(masm);

// exception pending => remove activation and forward to exception handler

Expand Down

0 comments on commit 5183d8a

Please sign in to comment.