Navigation Menu

Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8260355: AArch64: deoptimization stub should save vector registers #2279

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
95 changes: 53 additions & 42 deletions src/hotspot/cpu/aarch64/sharedRuntime_aarch64.cpp
Expand Up @@ -83,25 +83,20 @@ class SimpleRuntimeFrame {
// FIXME -- this is used by C1
class RegisterSaver {
public:
static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors = false);
static void restore_live_registers(MacroAssembler* masm, bool restore_vectors = false);
static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors);
static void restore_live_registers(MacroAssembler* masm, bool restore_vectors);

// Offsets into the register save area
// Used by deoptimization when it is managing result register
// values on its own

static int r0_offset_in_bytes(void) { return (32 + r0->encoding()) * wordSize; }
static int reg_offset_in_bytes(Register r) { return r0_offset_in_bytes() + r->encoding() * wordSize; }
static int rmethod_offset_in_bytes(void) { return reg_offset_in_bytes(rmethod); }
static int rscratch1_offset_in_bytes(void) { return (32 + rscratch1->encoding()) * wordSize; }
static int reg_offset_in_bytes(Register r, bool save_vectors);
static int r0_offset_in_bytes(bool save_vectors) { return reg_offset_in_bytes(r0, save_vectors); }
static int rscratch1_offset_in_bytes(bool save_vectors) { return reg_offset_in_bytes(rscratch1, save_vectors); }
static int v0_offset_in_bytes(void) { return 0; }
static int return_offset_in_bytes(void) { return (32 /* floats*/ + 31 /* gregs*/) * wordSize; }

// During deoptimization only the result registers need to be restored,
// all the other values have already been extracted.
static void restore_result_registers(MacroAssembler* masm);

// Capture info about frame layout
// Capture info about frame layout
// Note this is only correct when not saving full vectors.
enum layout {
fpu_state_off = 0,
fpu_state_end = fpu_state_off + FPUStateSizeInWords - 1,
Expand All @@ -116,6 +111,30 @@ class RegisterSaver {

};

// Returns the byte offset of the save slot for integer register `r` within
// the frame pushed by save_live_registers().
//
// The integer registers are stored above the floating point registers in
// that frame, so the offset of the first integer slot equals the size of the
// floating point save area. That size depends on how many slots are reserved
// per floating point register:
//   - save_slots_per_register    when full vectors are not being saved,
//   - slots_per_neon_register    when saving full NEON vectors,
//   - scalable_vector_reg_size() when the matcher reports scalable (SVE)
//                                vector support.
//
// `save_vectors` must match the value that was passed to
// save_live_registers() when the frame was created.
int RegisterSaver::reg_offset_in_bytes(Register r, bool save_vectors) {
  int vect_slots = FloatRegisterImpl::save_slots_per_register;

#if COMPILER2_OR_JVMCI
  if (save_vectors) {
#ifdef COMPILER2
    // Scalable vectors take precedence over the fixed NEON size.
    vect_slots = Matcher::supports_scalable_vector()
                     ? Matcher::scalable_vector_reg_size(T_FLOAT)
                     : FloatRegisterImpl::slots_per_neon_register;
#else
    vect_slots = FloatRegisterImpl::slots_per_neon_register;
#endif
  }
#endif

  // Size of the floating point save area; slots are counted in BytesPerInt units.
  const int fp_area_bytes = (vect_slots * FloatRegisterImpl::number_of_registers) * BytesPerInt;
  // Integer registers follow, one word each, indexed by encoding.
  const int int_reg_bytes = r->encoding() * wordSize;
  return fp_area_bytes + int_reg_bytes;
}

OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool save_vectors) {
bool use_sve = false;
int sve_vector_size_in_bytes = 0;
Expand Down Expand Up @@ -209,23 +228,6 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve

}

// Emits code that reloads only the result registers (v0 and r0) from the
// register save area addressed off sp, then pops that save area.
//
// masm: assembler the instructions are emitted into (presumably reached
//       through the `__` macro -- its definition is not shown here).
void RegisterSaver::restore_result_registers(MacroAssembler* masm) {

// Just restore result register. Only used by deoptimization. By
// now any callee save register that needs to be restored to a c2
// caller of the deoptee has been extracted into the vframeArray
// and will be stuffed into the c2i adapter we create for later
// restoration so only result registers need to be restored here.

// Restore fp result register
__ ldrd(v0, Address(sp, v0_offset_in_bytes()));
// Restore integer result register
__ ldr(r0, Address(sp, r0_offset_in_bytes()));

// Pop all of the register save area off the stack. The amount popped is
// return_offset_in_bytes() passed through align_up(..., 16).
__ add(sp, sp, align_up(return_offset_in_bytes(), 16));
}

// Is vector's size (in bytes) bigger than a size saved by default?
// 8 bytes vector registers are saved by default on AArch64.
bool SharedRuntime::is_wide_vector(int size) {
Expand Down Expand Up @@ -2161,6 +2163,7 @@ void SharedRuntime::generate_deopt_blob() {
int frame_size_in_words;
OopMap* map = NULL;
OopMapSet *oop_maps = new OopMapSet();
bool save_vectors = COMPILER2_OR_JVMCI != 0;

// -------------
// This code enters when returning to a de-optimized nmethod. A return
Expand Down Expand Up @@ -2198,7 +2201,7 @@ void SharedRuntime::generate_deopt_blob() {
// Prolog for non exception case!

// Save everything in sight.
map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);

// Normal deoptimization. Save exec mode for unpack_frames.
__ movw(rcpool, Deoptimization::Unpack_deopt); // callee-saved
Expand All @@ -2216,7 +2219,7 @@ void SharedRuntime::generate_deopt_blob() {
// return address is the pc describes what bci to do re-execute at

// No need to update map as each call to save_live_registers will produce identical oopmap
(void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
(void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);

__ movw(rcpool, Deoptimization::Unpack_reexecute); // callee-saved
__ b(cont);
Expand All @@ -2235,7 +2238,7 @@ void SharedRuntime::generate_deopt_blob() {
uncommon_trap_offset = __ pc() - start;

// Save everything in sight.
RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);
// fetch_unroll_info needs to call last_java_frame()
Label retaddr;
__ set_last_Java_frame(sp, noreg, retaddr, rscratch1);
Expand Down Expand Up @@ -2292,7 +2295,7 @@ void SharedRuntime::generate_deopt_blob() {
// This is a somewhat fragile mechanism.

// Save everything in sight.
map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);

// Now it is safe to overwrite any register

Expand Down Expand Up @@ -2373,7 +2376,7 @@ void SharedRuntime::generate_deopt_blob() {
__ verify_oop(r0);

// Overwrite the result registers with the exception results.
__ str(r0, Address(sp, RegisterSaver::r0_offset_in_bytes()));
__ str(r0, Address(sp, RegisterSaver::r0_offset_in_bytes(save_vectors)));
// I think this is useless
// __ str(r3, Address(sp, RegisterSaver::r3_offset_in_bytes()));

Expand All @@ -2382,7 +2385,14 @@ void SharedRuntime::generate_deopt_blob() {
// Only register save data is on the stack.
// Now restore the result registers. Everything else is either dead
// or captured in the vframeArray.
RegisterSaver::restore_result_registers(masm);

// Restore fp result register
__ ldrd(v0, Address(sp, RegisterSaver::v0_offset_in_bytes()));
// Restore integer result register
__ ldr(r0, Address(sp, RegisterSaver::r0_offset_in_bytes(save_vectors)));

// Pop all of the register save area off the stack
__ add(sp, sp, frame_size_in_words * wordSize);

// All of the register save area has been popped off the stack. Only the
// return address remains.
Expand Down Expand Up @@ -2464,7 +2474,7 @@ void SharedRuntime::generate_deopt_blob() {

// Restore frame locals after moving the frame
__ strd(v0, Address(sp, RegisterSaver::v0_offset_in_bytes()));
__ str(r0, Address(sp, RegisterSaver::r0_offset_in_bytes()));
__ str(r0, Address(sp, RegisterSaver::r0_offset_in_bytes(save_vectors)));

// Call C code. Need thread but NOT official VM entry
// crud. We cannot block on this call, no GC can happen. Call should
Expand Down Expand Up @@ -2492,7 +2502,7 @@ void SharedRuntime::generate_deopt_blob() {

// Collect return values
__ ldrd(v0, Address(sp, RegisterSaver::v0_offset_in_bytes()));
__ ldr(r0, Address(sp, RegisterSaver::r0_offset_in_bytes()));
__ ldr(r0, Address(sp, RegisterSaver::r0_offset_in_bytes(save_vectors)));
// I think this is useless (throwing pc?)
// __ ldr(r3, Address(sp, RegisterSaver::r3_offset_in_bytes()));

Expand Down Expand Up @@ -2852,13 +2862,14 @@ RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const cha
MacroAssembler* masm = new MacroAssembler(&buffer);

int frame_size_in_words;
bool save_vectors = false;

OopMapSet *oop_maps = new OopMapSet();
OopMap* map = NULL;

int start = __ offset();

map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, save_vectors);

int frame_complete = __ offset();

Expand Down Expand Up @@ -2890,11 +2901,11 @@ RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const cha

// get the returned Method*
__ get_vm_result_2(rmethod, rthread);
__ str(rmethod, Address(sp, RegisterSaver::reg_offset_in_bytes(rmethod)));
__ str(rmethod, Address(sp, RegisterSaver::reg_offset_in_bytes(rmethod, save_vectors)));

// r0 is where we want to jump, overwrite rscratch1 which is saved and scratch
__ str(r0, Address(sp, RegisterSaver::rscratch1_offset_in_bytes()));
RegisterSaver::restore_live_registers(masm);
__ str(r0, Address(sp, RegisterSaver::rscratch1_offset_in_bytes(save_vectors)));
RegisterSaver::restore_live_registers(masm, save_vectors);

// We are back to the original state on entry and ready to go.

Expand All @@ -2904,7 +2915,7 @@ RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const cha

__ bind(pending);

RegisterSaver::restore_live_registers(masm);
RegisterSaver::restore_live_registers(masm, save_vectors);

// exception pending => remove activation and forward to exception handler

Expand Down
16 changes: 12 additions & 4 deletions src/hotspot/share/prims/vectorSupport.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -135,10 +135,18 @@ Handle VectorSupport::allocate_vector_payload_helper(InstanceKlass* ik, frame* f
VMReg vreg = VMRegImpl::as_VMReg(location.register_number());

for (int i = 0; i < num_elem; i++) {
int vslot = (i * elem_size) / VMRegImpl::stack_slot_size;
int off = (i * elem_size) % VMRegImpl::stack_slot_size;
bool contiguous = X86_ONLY(false) NOT_X86(true);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't like this change. It's not x86-specific, but SVE-specific code. What is broken here is that VMReg::next() doesn't work properly for VecA registers and, as a result, RegisterMap::location(VMReg) is unusable as well.

So, a proper fix should address that instead. If there's no way to save VMReg::next() and RegisterMap::location(VMReg), then a new cross-platform API should be introduced and VectorSupport::allocate_vector_payload_helper() migrated to it.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For Arm NEON (and PPC) we don't set VMReg::next() in the oopmap either, and their vector slots are contiguous, so isn't that x86-specific? But yes, NEON could also generate a correct full oopmap, since its vector size is fixed. For SVE, I have no idea how to provide proper VMReg::next() support, so Nick's solution looks good to me. Regarding introducing a new cross-platform API, which API do you mean? If we could have a better API, that would be perfect. Currently, allocate_vector_payload_helper() is the only vector-related caller of RegisterMap::location() that I can see.

Copy link

@iwanowww iwanowww Jan 28, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably, x86 is unique in using a non-contiguous representation for vector values, but that doesn't make the code in question x86-specific. AArch64 is the only user of VecA, and VecA is the only register type that has a mismatch in size between its in-memory and RegMask representations. So, I conclude it is AArch64/SVE-specific.

On x86, RegisterMap isn't fully populated for vector registers either, but there's RegisterMap::pd_location() to cover that.

Regarding the new API, I mean an alternative to VMReg::next()/RegisterMap::location(VMReg) which is able to handle the VecA case well. As Nick pointed out earlier, the problem with VecA is that there's no VMReg representation for all the slots which comprise the register value.

Either enhancing VMReg::next(int) to produce special values for the VecA case or introducing RegisterMap::location(VMReg base_reg, int slot) would be a better way to handle the problem.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@iwanowww please take a look at the latest set of changes and let me know what you think. There's now a RegisterMap::location(VMReg base_reg, int slot) method as you suggest. That in turn uses a new method VMReg::is_expressible(int slot_delta) which is true if offsetting a VMReg by slot_delta slots gives another valid VMReg which is also a slot of the same physical register (i.e. reg->next(slot_delta) is valid). We can use this to fall back to pd_location if a slot of a vector is not expressible as a VMReg (i.e. for SVE). Unfortunately it touches a lot of files but that seems unavoidable.

address elem_addr;

if (contiguous) {
elem_addr = reg_map->location(vreg) + (i * elem_size);
} else {
int vslot = (i * elem_size) / VMRegImpl::stack_slot_size;
int off = (i * elem_size) % VMRegImpl::stack_slot_size;

elem_addr = reg_map->location(vreg->next(vslot)) + off;
}

address elem_addr = reg_map->location(vreg->next(vslot)) + off;
init_payload_element(arr, is_mask, elem_bt, i, elem_addr);
}
} else {
Expand Down