8262355: Support for AVX-512 opmask register allocation. #2768

Closed (wants to merge 23 commits)
Changes from 1 commit

Commits (23):
9e1c3e0 - 8262355: Support for AVX-512 opmask register allocation. (Feb 28, 2021)
69003aa - 8262355: Fix for AARCH64 build failure. (Feb 28, 2021)
ffacbc6 - 8262355: Creating a new ideal type TypeVectMask for mask generating n… (Mar 4, 2021)
4eaa3d8 - 8262355: Some synthetic changes for cleanup. (Mar 4, 2021)
4fadca5 - 8262355: Review comments resolution and deopt handling for mask gene… (Mar 10, 2021)
f82741e - 8262355: Fix for hs-minimal and windows build failures. (Mar 10, 2021)
72f02a9 - 8262355: Fix for windows build failure. (Mar 10, 2021)
a46b2bf - 8262355: Removing object re-materialization handling for mask-generat… (Mar 11, 2021)
df16ac0 - 8262355: Review comments resolution. (Mar 16, 2021)
fcf2cce - 8262355: Fix build failure (Mar 16, 2021)
847fdce - 8262355: Review comments resolutions. (Mar 17, 2021)
f1748bf - Merge branch 'master' of http://github.com/openjdk/jdk into JDK-8262355 (Mar 17, 2021)
29e889e - Merge branch 'master' of http://github.com/openjdk/jdk into JDK-8262355 (Mar 18, 2021)
7751342 - 8262355: Review comments resolution (Mar 18, 2021)
661fbda - 8262355: Extending Type::isa_vect and Type::is_vect routines to TypeV… (Mar 21, 2021)
8a05fbb - 8262355: Review comments resolution. (Mar 24, 2021)
837428d - 8262355: Adding missed safety check. (Mar 25, 2021)
13e791a - 8262355: Updating copywriter for edited files. (Mar 28, 2021)
eeea2d7 - Merge branch 'master' of http://github.com/openjdk/jdk into JDK-8262355 (Mar 29, 2021)
5aa0730 - 8262355: Review comments resolutions. (Apr 1, 2021)
366641a - 8262355: Fix AARCH64 build issue (Apr 1, 2021)
d6bec3d - Merge http://github.com/openjdk/jdk into JDK-8262355 (Apr 1, 2021)
b9810d2 - 8262355: Rebasing patch, 32bit clean-up. (Apr 2, 2021)
@@ -130,8 +130,8 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
int num_xmm_regs = XMMRegisterImpl::number_of_registers;
int ymm_bytes = num_xmm_regs * 16;
int zmm_bytes = num_xmm_regs * 32;
int opmask_state_bytes = KRegisterImpl::number_of_registers * 8;
#ifdef COMPILER2
int opmask_state_bytes = KRegisterImpl::number_of_registers * 8;
if (save_vectors) {
assert(UseAVX > 0, "Vectors larger than 16 byte long are supported only with AVX");
assert(MaxVectorSize <= 64, "Only up to 64 byte long vectors are supported");
@@ -140,12 +140,10 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
if (UseAVX > 2) {
// Save upper half of ZMM registers as well
vect_bytes += zmm_bytes;
additional_frame_words += opmask_state_bytes / wordSize;
}
additional_frame_words += vect_bytes / wordSize;
}
if(UseAVX > 2) {
additional_frame_words += opmask_state_bytes / wordSize;
}
#else
assert(!save_vectors, "vectors are generated only by C2");
#endif
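
A standalone sketch of the frame-sizing arithmetic in the hunk above (assuming KRegisterImpl::number_of_registers is 8, i.e. k0..k7, and 8-byte words on x86_64; illustrative only, not HotSpot code):

#include <cstdio>

int main() {
  const int number_of_kregisters = 8;  // assumed value of KRegisterImpl::number_of_registers
  const int wordSize = 8;              // bytes per word on LP64
  int opmask_state_bytes = number_of_kregisters * 8;           // 64 bytes of k-register state
  int additional_frame_words = opmask_state_bytes / wordSize;  // words added to the save frame
  printf("opmask save area: %d bytes = %d frame words\n",
         opmask_state_bytes, additional_frame_words);          // 64 bytes = 8 words
  return 0;
}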
@@ -233,17 +231,13 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
for (int n = 0; n < num_xmm_regs; n++) {
__ vextractf64x4_high(Address(rsp, n*32), as_XMMRegister(n));
}
__ subptr(rsp, opmask_state_bytes);
// Save opmask registers
for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
__ kmovql(Address(rsp, n*8), as_KRegister(n));
}
}
}
#ifdef COMPILER2
if (UseAVX > 2) {
__ subptr(rsp, opmask_state_bytes);
// Save opmask registers
for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
__ kmovql(Address(rsp, n*8), as_KRegister(n));
}
}
#endif
__ vzeroupper();

// Set an oopmap for the call site. This oopmap will map all
@@ -304,12 +298,10 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
if (UseAVX > 2) {
// Save upper half of ZMM registers as well
additional_frame_bytes += zmm_bytes;
opmask_state_bytes = KRegisterImpl::number_of_registers * 8;
additional_frame_bytes += opmask_state_bytes;
}
}
if (UseAVX > 2) {
opmask_state_bytes = KRegisterImpl::number_of_registers * 8;
additional_frame_bytes += opmask_state_bytes;
}
#else
assert(!restore_vectors, "vectors are generated only by C2");
#endif
@@ -347,22 +339,11 @@ void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_ve
for (int n = 0; n < num_xmm_regs; n++) {
__ vinsertf64x4_high(as_XMMRegister(n), Address(rsp, n*32+off));
}
#ifdef COMPILER2
for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
__ kmovql(as_KRegister(n), Address(rsp, n*8));
}
#endif
}
__ addptr(rsp, additional_frame_bytes);
} else {
#ifdef COMPILER2
if (UseAVX > 2) {
for (int n = 0; n < KRegisterImpl::number_of_registers; n++) {
__ kmovql(as_KRegister(n), Address(rsp, n*8));
}
__ addptr(rsp, additional_frame_bytes);
}
#endif
}

__ pop_FPU_state();
@@ -107,11 +107,17 @@ class RegisterSaver {
ymm_off = xmm_off + (XSAVE_AREA_YMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
DEF_YMM_OFFS(0),
DEF_YMM_OFFS(1),
// 2..7 are implied in range usage
// 2..15 are implied in range usage
opmask_off = xmm_off + (XSAVE_AREA_OPMASK_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
DEF_OPMASK_OFFS(0),
DEF_OPMASK_OFFS(1),
// 2..15 are implied in range usage
DEF_OPMASK_OFFS(2),
DEF_OPMASK_OFFS(3),
DEF_OPMASK_OFFS(4),
DEF_OPMASK_OFFS(5),
DEF_OPMASK_OFFS(6),
DEF_OPMASK_OFFS(7),
// 2..7 are implied in range usage
zmm_off = xmm_off + (XSAVE_AREA_ZMM_BEGIN - XSAVE_AREA_BEGIN)/BytesPerInt,
DEF_ZMM_OFFS(0),
DEF_ZMM_OFFS(1),
@@ -291,6 +297,13 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg());
off += delta;
}
map->set_callee_saved(STACK_OFFSET( opmask1_off ), k1->as_VMReg());
map->set_callee_saved(STACK_OFFSET( opmask2_off ), k2->as_VMReg());
map->set_callee_saved(STACK_OFFSET( opmask3_off ), k3->as_VMReg());
map->set_callee_saved(STACK_OFFSET( opmask4_off ), k4->as_VMReg());
map->set_callee_saved(STACK_OFFSET( opmask5_off ), k5->as_VMReg());
map->set_callee_saved(STACK_OFFSET( opmask6_off ), k6->as_VMReg());
map->set_callee_saved(STACK_OFFSET( opmask7_off ), k7->as_VMReg());
}

#if COMPILER2_OR_JVMCI
@@ -351,6 +364,13 @@ OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_
map->set_callee_saved(STACK_OFFSET(off), zmm_name->as_VMReg()->next());
off += delta;
}
map->set_callee_saved(STACK_OFFSET( opmask1H_off ), k1->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( opmask2H_off ), k2->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( opmask3H_off ), k3->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( opmask4H_off ), k4->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( opmask5H_off ), k5->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( opmask6H_off ), k6->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET( opmask7H_off ), k7->as_VMReg()->next());
}
}

@@ -425,7 +425,7 @@ void reg_mask_init() {
_INT_NO_RCX_REG_mask = _INT_REG_mask;
_INT_NO_RCX_REG_mask.Remove(OptoReg::as_OptoReg(rcx->as_VMReg()));

if(Matcher::has_predicated_vectors()) {
if (Matcher::has_predicated_vectors()) {
const_cast<RegMask*>(&_OPMASK_REG_mask)->Remove(OptoReg::as_OptoReg(k0->as_VMReg()));

@iwanowww (Mar 23, 2021):

I still don't see any reason to keep k0 declared but then unconditionally remove it from the opmask_reg class.

You mentioned that KRegisterImpl::number_of_registers should be adjusted, but I don't see why that would be necessary.
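
For background on that point: in the EVEX encoding the 3-bit writemask field value 0b000 means "no masking", so k0 can never be named as an instruction's predicate, even though it is a real register (usable by kmov and for spills). A standalone sketch of that decoding (illustrative only, not HotSpot code):

#include <cstdio>

int main() {
  // Decode the EVEX 3-bit writemask field ("aaa"): 0 selects unmasked
  // execution; 1..7 select writemasks k1..k7. k0 is not encodable here,
  // which is why it is excluded from the allocatable opmask class.
  for (int aaa = 0; aaa < 8; aaa++) {
    if (aaa == 0) {
      printf("aaa=0 -> unmasked (k0 cannot be a writemask)\n");
    } else {
      printf("aaa=%d -> writemask k%d\n", aaa, aaa);
    }
  }
  return 0;
}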

}
}
@@ -1404,7 +1404,7 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
} else if (dst_first_rc == rc_kreg) {
if ((src_first & 1) == 0 && src_first + 1 == src_second &&
(dst_first & 1) == 0 && dst_first + 1 == dst_second) {
// 64-bit
// 64-bit
if (cbuf) {
MacroAssembler _masm(cbuf);
__ kmovql(as_KRegister(Matcher::_regEncode[dst_first]), as_Register(Matcher::_regEncode[src_first]));
@@ -1541,7 +1541,7 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
}
}
return 0;
} else if(dst_first_rc == rc_int) {
} else if (dst_first_rc == rc_int) {
if ((src_first & 1) == 0 && src_first + 1 == src_second &&
(dst_first & 1) == 0 && dst_first + 1 == dst_second) {
// 64-bit
@@ -1558,7 +1558,7 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
}
Unimplemented();
return 0;
} else if(dst_first_rc == rc_kreg) {
} else if (dst_first_rc == rc_kreg) {
if ((src_first & 1) == 0 && src_first + 1 == src_second &&
(dst_first & 1) == 0 && dst_first + 1 == dst_second) {
// 64-bit
@@ -1574,7 +1574,7 @@ uint MachSpillCopyNode::implementation(CodeBuffer* cbuf,
}
}
return 0;
} else if(dst_first_rc == rc_float) {
} else if (dst_first_rc == rc_float) {
assert(false, "Illegal spill");
return 0;
}
@@ -50,6 +50,8 @@ void Location::print_on(outputStream* st) const {
case float_in_dbl: st->print(",float"); break;
case dbl: st->print(",double"); break;
case addr: st->print(",address"); break;
case vector: st->print(",vector"); break;
case vectorpred: st->print(",vectorpred"); break;
default: st->print("Wrong location type %d", type());
}
}
@@ -59,6 +59,7 @@ class Location {
float_in_dbl, // Float held in double register
dbl, // Double held in one register
vector, // Vector in one register
vectorpred, // Predicated register in one register
addr, // JSR return address
narrowoop // Narrow Oop (please GC me!)
};
@@ -919,7 +919,7 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
}
break;
case Op_RegVMask:
lrg.set_num_regs(RegMask::SlotsPerRegVmask);
lrg.set_num_regs(RegMask::SlotsPerRegVMask);

@iwanowww (Mar 16, 2021):

Add assert(Matcher::has_predicated_vectors(), "...") here?

@jatin-bhateja (Author, Mar 17, 2021):

This will interfere with non-AVX512 targets, since some instruction patterns have kReg temporary operands which will still be allocated but never JITed.

lrg.set_reg_pressure(1);
break;
case Op_RegF:
@@ -897,6 +897,8 @@ void PhaseOutput::FillLocArray( int idx, MachSafePointNode* sfpt, Node *local,
t->base() == Type::VectorD || t->base() == Type::VectorX ||
t->base() == Type::VectorY || t->base() == Type::VectorZ) {
array->append(new_loc_value( C->regalloc(), regnum, Location::vector ));
} else if ( t->base() == Type::VectorM ) {
array->append(new_loc_value( C->regalloc(), regnum, Location::vectorpred ));
} else {
array->append(new_loc_value( C->regalloc(), regnum, C->regalloc()->is_oop(local) ? Location::oop : Location::normal ));
}
@@ -983,6 +985,14 @@ void PhaseOutput::FillLocArray( int idx, MachSafePointNode* sfpt, Node *local,
case Type::Top: // Add an illegal value here
array->append(new LocationValue(Location()));
break;
case Type::VectorA:
case Type::VectorS:
case Type::VectorD:
case Type::VectorX:
case Type::VectorY:
case Type::VectorZ:
case Type::VectorM:
// Vector payload should always be present in a physical vector register location.
default:
ShouldNotReachHere();
break;
@@ -68,7 +68,7 @@ int RegMask::num_registers(uint ireg) {
case Op_VecD:
return SlotsPerVecD;
case Op_RegVMask:
return SlotsPerRegVmask;
return SlotsPerRegVMask;

@iwanowww (Mar 16, 2021):

Should there be assert(Matcher::has_predicated_vectors(), "...")?

case Op_RegD:
case Op_RegL:
#ifdef _LP64
@@ -105,7 +105,7 @@ class RegMask {
SlotsPerVecX = 4,
SlotsPerVecY = 8,
SlotsPerVecZ = 16,
SlotsPerRegVmask = X86_ONLY(2) NOT_X86(1)
SlotsPerRegVMask = X86_ONLY(2) NOT_X86(1)

@iwanowww (Mar 16, 2021):

It begs for either multiple definitions (like SlotsPerVec*) or a variable-sized one like SlotsPerVecA.

Would like to hear from @XiaohongGong and @nsjian on that.

@nsjian (Contributor, Mar 17, 2021):

I am fine with the current code. Since we don't have other SlotsPerRegVMask* definitions yet, we will treat it as variable-sized for SVE, where it is 1.

@iwanowww (Mar 18, 2021):

Ok, maybe set SlotsPerRegVMask = 2 unconditionally then?

@nsjian (Contributor, Mar 19, 2021):

Since the SVE predicate register size is 1 ~ 8 slots, I think 1 would be better here; it saves one mask bit and makes bit-mask handling easier (e.g. allocation on stack).
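
A standalone sketch of the slot arithmetic behind that 1 ~ 8 range (assuming 4-byte VMReg stack slots and one SVE predicate bit per vector byte; illustrative only, not HotSpot code):

#include <cstdio>

int main() {
  const int slot_bytes = 4;                      // VMRegImpl::stack_slot_size (assumed)
  for (int vec_bits = 128; vec_bits <= 2048; vec_bits *= 2) {
    int pred_bytes = (vec_bits / 8) / 8;         // one predicate bit per vector byte
    int slots = (pred_bytes + slot_bytes - 1) / slot_bytes;  // round up to whole slots
    printf("vector %4d bits -> predicate %2d bytes -> %d slot(s)\n",
           vec_bits, pred_bytes, slots);         // ranges from 1 to 8 slots
  }
  return 0;
}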

};

// A constructor only used by the ADLC output. All mask fields are filled
@@ -661,7 +661,7 @@ void Type::Initialize_shared(Compile* current) {
// get_zero_type() should not happen for T_CONFLICT
_zero_type[T_CONFLICT]= NULL;

#if defined(AMD64) || defined(IA32)
#if defined(X86)

@iwanowww (Mar 16, 2021):

Why is it X86-specific? Should it be simply guarded by Matcher::has_predicated_vectors() instead?

@nsjian (Contributor, Mar 17, 2021):

Yes, SVE should also be fine with this. Perhaps use the default type TypeInt::BYTE instead of TypeInt::BOOL? Masking a byte type seems more reasonable.

TypeVect::VMASK = (TypeVect*)(new TypeVectMask(get_const_basic_type(T_BOOLEAN), MaxVectorSize))->hashcons();
mreg2type[Op_RegVMask] = TypeVect::VMASK;
#endif
@@ -303,7 +303,7 @@ class Type {
const TypeVect *is_vect() const; // Vector
const TypeVect *isa_vect() const; // Returns NULL if not a Vector
const TypeVectMask *is_vectmask() const; // Predicate/Mask Vector
const TypeVectMask *isa_vectmask() const; // Returns NULL if not a Predicate/Mask Vector
const TypeVectMask *isa_vectmask() const; // Returns NULL if not a Vector Predicate/Mask
const TypePtr *is_ptr() const; // Asserts it is a ptr type
const TypePtr *isa_ptr() const; // Returns NULL if not ptr type
const TypeRawPtr *isa_rawptr() const; // NOT Java oop
@@ -86,24 +86,34 @@ jint VectorSupport::klass2length(InstanceKlass* ik) {
return vlen;
}

void VectorSupport::init_payload_element(typeArrayOop arr, bool is_mask, BasicType elem_bt, int index, address addr) {
void VectorSupport::init_payload_element(typeArrayOop arr, bool is_mask, BasicType elem_bt, int index, address addr, Location loc) {

@iwanowww (Mar 16, 2021):

You can just revert the changes in vectorSupport.cpp/vectorSupport.hpp.

if (is_mask) {
// Masks require special handling: when boxed they are packed and stored in boolean
// arrays, but in scalarized form they have the same size as corresponding vectors.
// For example, Int512Mask is represented in memory as boolean[16], but
// occupies the whole 512-bit vector register when scalarized.
// (In generated code, the conversion is performed by VectorStoreMask.)
//
// TODO: revisit when predicate registers are fully supported.

@iwanowww (Mar 18, 2021):

A leftover from the reverted change.
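
A standalone sketch of the packed-boolean representation described in the comment block above, where a scalarized Int512Mask holds 0 or -1 per int lane while the boxed form keeps one boolean per lane (illustrative only, not HotSpot code):

#include <cstdint>
#include <cstdio>

int main() {
  // Scalarized form: 16 int lanes of a 512-bit register, each 0 or -1.
  int32_t lanes[16] = { -1, 0, -1, -1, 0, 0, 0, -1,
                         0, -1, 0, 0, -1, 0, -1, 0 };
  bool boxed[16];  // boxed form: boolean[16]
  for (int i = 0; i < 16; i++) {
    boxed[i] = (lanes[i] != 0);  // the same test the switch below applies per element
  }
  for (int i = 0; i < 16; i++) {
    printf("%d", boxed[i]);      // prints 1011000101001010
  }
  printf("\n");
  return 0;
}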

switch (elem_bt) {
case T_BYTE: arr->bool_at_put(index, (*(jbyte*)addr) != 0); break;
case T_SHORT: arr->bool_at_put(index, (*(jshort*)addr) != 0); break;
case T_INT: // fall-through
case T_FLOAT: arr->bool_at_put(index, (*(jint*)addr) != 0); break;
case T_LONG: // fall-through
case T_DOUBLE: arr->bool_at_put(index, (*(jlong*)addr) != 0); break;

default: fatal("unsupported: %s", type2name(elem_bt));
if (loc.type() == Location::vectorpred) {
#if defined(X86)
// For targets supporting the X86 AVX-512 feature, the opmask/predicate register is 8 bytes (64 bits) wide,
// where each bit position corresponds to a lane element in the vector register. This is different
// from non-AVX512 targets, where the mask vector has the same shape as the vector register it operates on.
assert(Matcher::has_predicated_vectors(), "");
arr->bool_at_put(index, ((*(jlong*)addr) & (1LL << index)) != 0);
return;
#endif
} else {
// Masks require special handling: when boxed they are packed and stored in boolean
// arrays, but in scalarized form they have the same size as corresponding vectors.
// For example, Int512Mask is represented in memory as boolean[16], but
// occupies the whole 512-bit vector register when scalarized.
// (In generated code, the conversion is performed by VectorStoreMask.)
//
switch (elem_bt) {
case T_BYTE: arr->bool_at_put(index, (*(jbyte*)addr) != 0); break;
case T_SHORT: arr->bool_at_put(index, (*(jshort*)addr) != 0); break;
case T_INT: // fall-through
case T_FLOAT: arr->bool_at_put(index, (*(jint*)addr) != 0); break;
case T_LONG: // fall-through
case T_DOUBLE: arr->bool_at_put(index, (*(jlong*)addr) != 0); break;

default: fatal("unsupported: %s", type2name(elem_bt));
}
}
} else {
switch (elem_bt) {
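
A standalone sketch of the bit-per-lane opmask layout handled by the vectorpred branch above, where bit i of the saved 64-bit k-register value governs lane i (illustrative only, not HotSpot code):

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t opmask = 0xA5;  // saved k-register contents: 0b10100101
  bool lanes[8];           // e.g. an 8-lane long/double mask
  for (int i = 0; i < 8; i++) {
    lanes[i] = ((opmask >> i) & 1) != 0;  // bit i -> lane i
  }
  for (int i = 0; i < 8; i++) {
    printf("%d", lanes[i]);               // prints 10100101 (lane 0 first)
  }
  printf("\n");
  return 0;
}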
@@ -135,26 +145,46 @@ Handle VectorSupport::allocate_vector_payload_helper(InstanceKlass* ik, frame* f
// Value was in a callee-saved register.
VMReg vreg = VMRegImpl::as_VMReg(location.register_number());

for (int i = 0; i < num_elem; i++) {
int vslot = (i * elem_size) / VMRegImpl::stack_slot_size;
int off = (i * elem_size) % VMRegImpl::stack_slot_size;

address elem_addr = reg_map->location(vreg, vslot) + off; // assumes little endian element order
init_payload_element(arr, is_mask, elem_bt, i, elem_addr);
if (location.type() == Location::vectorpred) {
#if defined(X86)
// On X86, two adjacent stack slots hold the 64-bit opmask register value.
address elem_addr = reg_map->location(vreg, 0); // assumes little endian element order
for (int i = 0; i < num_elem; i++) {
init_payload_element(arr, is_mask, elem_bt, i, elem_addr, location);
}
#endif
} else {
for (int i = 0; i < num_elem; i++) {
int vslot = (i * elem_size) / VMRegImpl::stack_slot_size;
int off = (i * elem_size) % VMRegImpl::stack_slot_size;

address elem_addr = reg_map->location(vreg, vslot) + off; // assumes little endian element order
init_payload_element(arr, is_mask, elem_bt, i, elem_addr, location);
}
}
} else {
// Value was directly saved on the stack.
address base_addr = ((address)fr->unextended_sp()) + location.stack_offset();
for (int i = 0; i < num_elem; i++) {
init_payload_element(arr, is_mask, elem_bt, i, base_addr + i * elem_size);
if (location.type() == Location::vectorpred) {
#if defined(X86)
// On X86, two adjacent stack slots hold the 64-bit opmask register value.
for (int i = 0; i < num_elem; i++) {
init_payload_element(arr, is_mask, elem_bt, i, base_addr, location);
}
#endif
} else {
for (int i = 0; i < num_elem; i++) {
init_payload_element(arr, is_mask, elem_bt, i, base_addr + i * elem_size, location);
}
}
}
return Handle(THREAD, arr);
}

Handle VectorSupport::allocate_vector_payload(InstanceKlass* ik, frame* fr, RegisterMap* reg_map, ScopeValue* payload, TRAPS) {
if (payload->is_location() &&
payload->as_LocationValue()->location().type() == Location::vector) {
(payload->as_LocationValue()->location().type() == Location::vector ||
payload->as_LocationValue()->location().type() == Location::vectorpred)) {
// Vector value in an aligned adjacent tuple (1, 2, 4, 8, or 16 slots).
Location location = payload->as_LocationValue()->location();
return allocate_vector_payload_helper(ik, fr, reg_map, location, THREAD); // safepoint
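
A standalone sketch of the "two adjacent stack slots" comment in the helper above (assuming 4-byte VMReg slots and little-endian layout; illustrative only, not HotSpot code):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint64_t kreg = 0x00000000DEADBEEFULL;  // value spilled by kmovql
  uint32_t slots[2];                      // the two adjacent 4-byte stack slots
  memcpy(slots, &kreg, sizeof(kreg));     // spill: one 64-bit store spans both slots
  uint64_t reloaded;
  memcpy(&reloaded, slots, sizeof(reloaded));  // reload from the first slot's address
  printf("0x%016llx\n", (unsigned long long)reloaded);  // 0x00000000deadbeef
  return 0;
}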
@@ -42,7 +42,7 @@ class VectorSupport : AllStatic {
static Handle allocate_vector_payload(InstanceKlass* ik, frame* fr, RegisterMap* reg_map, ScopeValue* payload, TRAPS);
static Handle allocate_vector_payload_helper(InstanceKlass* ik, frame* fr, RegisterMap* reg_map, Location location, TRAPS);

static void init_payload_element(typeArrayOop arr, bool is_mask, BasicType elem_bt, int index, address addr);
static void init_payload_element(typeArrayOop arr, bool is_mask, BasicType elem_bt, int index, address addr, Location location);

static BasicType klass2bt(InstanceKlass* ik);
static jint klass2length(InstanceKlass* ik);
@@ -153,6 +153,7 @@ StackValue* StackValue::create_stack_value(const frame* fr, const RegisterMap* r
case Location::invalid: {
return new StackValue();
}
case Location::vectorpred:
case Location::vector: {
ShouldNotReachHere(); // should be handled by Deoptimization::realloc_objects()
}