Skip to content
Permalink
Browse files
8266951: Partial in-lining for vectorized mismatch operation using AVX512 masked instructions

Reviewed-by: psandoz, vlivanov
  • Loading branch information
Jatin Bhateja committed Jun 5, 2021
1 parent f768fbf commit b05c40ca3b5fd34cbbc7a9479b108a4ff2c099f1
@@ -2572,6 +2572,13 @@ void Assembler::knotwl(KRegister dst, KRegister src) {
emit_int16(0x44, (0xC0 | encode));
}

// KNOTQ k1, k2 -- bitwise NOT of a 64-bit opmask (k) register.
// Encoding is VEX.L0.0F.W1 44 /r: the same 0x44 opcode byte as knotwl,
// with no SIMD prefix and rex_w=1 selecting the quadword form (see Intel
// SDM, KNOTW/KNOTB/KNOTQ/KNOTD).  Guarded on AVX512BW, which provides
// the 64-bit-wide opmask operations.
void Assembler::knotql(KRegister dst, KRegister src) {
  assert(VM_Version::supports_avx512bw(), "");
  // uses_vl = false: opmask instructions do not consult the EVEX/VEX
  // vector-length bits; AVX_128bit here just means L=0.
  InstructionAttr attributes(AVX_128bit, /* rex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
  // 0xC0 | encode forms the ModRM byte for the register-register form.
  emit_int16(0x44, (0xC0 | encode));
}

// This instruction produces ZF or CF flags
void Assembler::kortestbl(KRegister src1, KRegister src2) {
assert(VM_Version::supports_avx512dq(), "");
@@ -1480,6 +1480,7 @@ class Assembler : public AbstractAssembler {
void kmovql(Register dst, KRegister src);

void knotwl(KRegister dst, KRegister src);
void knotql(KRegister dst, KRegister src);

void kortestbl(KRegister dst, KRegister src);
void kortestwl(KRegister dst, KRegister src);
@@ -1923,7 +1923,7 @@ void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMReg
}

void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
assert(ArrayCopyPartialInlineSize <= 64,"");
assert(ArrayOperationPartialInlineSize > 0 && ArrayOperationPartialInlineSize <= 64, "invalid");
mov64(temp, -1L);
bzhiq(temp, temp, len);
kmovql(dst, temp);
@@ -2140,11 +2140,37 @@ void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src
}
}

// Emit the EVEX masked vector compare matching the element type:
// kdmask = (src1 <comparison> src2) restricted to the lanes selected by
// ksmask.  Sub-word, word, dword and qword element types dispatch to the
// corresponding evpcmpb/w/d/q form; T_FLOAT and T_DOUBLE are compared
// bitwise through the integer dword/qword forms.
void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len) {
  if (typ == T_BYTE || typ == T_BOOLEAN) {
    evpcmpb(kdmask, ksmask, src1, src2, comparison, vector_len);
  } else if (typ == T_SHORT || typ == T_CHAR) {
    evpcmpw(kdmask, ksmask, src1, src2, comparison, vector_len);
  } else if (typ == T_INT || typ == T_FLOAT) {
    evpcmpd(kdmask, ksmask, src1, src2, comparison, vector_len);
  } else if (typ == T_LONG || typ == T_DOUBLE) {
    evpcmpq(kdmask, ksmask, src1, src2, comparison, vector_len);
  } else {
    assert(false, "Should not reach here.");
  }
}

void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
switch(typ) {
case T_BOOLEAN:
case T_BYTE:
evpcmpb(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
break;
case T_CHAR:
case T_SHORT:
evpcmpw(kdmask, ksmask, src1, adr, comparison, /*signed*/ true, vector_len, scratch);
break;
@@ -139,6 +139,7 @@

// blend
void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch = rscratch1);
void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, XMMRegister src2, int comparison, int vector_len);
void evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);

void load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt);
@@ -1419,12 +1419,12 @@ void VM_Version::get_processor_features() {
}
#ifdef COMPILER2
if (UseAVX > 2) {
if (FLAG_IS_DEFAULT(ArrayCopyPartialInlineSize) ||
(!FLAG_IS_DEFAULT(ArrayCopyPartialInlineSize) &&
ArrayCopyPartialInlineSize != 0 &&
ArrayCopyPartialInlineSize != 32 &&
ArrayCopyPartialInlineSize != 16 &&
ArrayCopyPartialInlineSize != 64)) {
if (FLAG_IS_DEFAULT(ArrayOperationPartialInlineSize) ||
(!FLAG_IS_DEFAULT(ArrayOperationPartialInlineSize) &&
ArrayOperationPartialInlineSize != 0 &&
ArrayOperationPartialInlineSize != 16 &&
ArrayOperationPartialInlineSize != 32 &&
ArrayOperationPartialInlineSize != 64)) {
int inline_size = 0;
if (MaxVectorSize >= 64 && AVX3Threshold == 0) {
inline_size = 64;
@@ -1433,18 +1433,18 @@ void VM_Version::get_processor_features() {
} else if (MaxVectorSize >= 16) {
inline_size = 16;
}
if(!FLAG_IS_DEFAULT(ArrayCopyPartialInlineSize)) {
warning("Setting ArrayCopyPartialInlineSize as %d", inline_size);
if(!FLAG_IS_DEFAULT(ArrayOperationPartialInlineSize)) {
warning("Setting ArrayOperationPartialInlineSize as %d", inline_size);
}
ArrayCopyPartialInlineSize = inline_size;
ArrayOperationPartialInlineSize = inline_size;
}

if (ArrayCopyPartialInlineSize > MaxVectorSize) {
ArrayCopyPartialInlineSize = MaxVectorSize >= 16 ? MaxVectorSize : 0;
if (ArrayCopyPartialInlineSize) {
warning("Setting ArrayCopyPartialInlineSize as MaxVectorSize" INTX_FORMAT ")", MaxVectorSize);
if (ArrayOperationPartialInlineSize > MaxVectorSize) {
ArrayOperationPartialInlineSize = MaxVectorSize >= 16 ? MaxVectorSize : 0;
if (ArrayOperationPartialInlineSize) {
warning("Setting ArrayOperationPartialInlineSize as MaxVectorSize" INTX_FORMAT ")", MaxVectorSize);
} else {
warning("Setting ArrayCopyPartialInlineSize as " INTX_FORMAT, ArrayCopyPartialInlineSize);
warning("Setting ArrayOperationPartialInlineSize as " INTX_FORMAT, ArrayOperationPartialInlineSize);
}
}
}
@@ -1578,6 +1578,7 @@ const bool Matcher::match_rule_supported(int opcode) {
}
break;

case Op_VectorCmpMasked:
case Op_VectorMaskGen:
case Op_LoadVectorMasked:
case Op_StoreVectorMasked:
@@ -1678,6 +1679,7 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
break;
case Op_ClearArray:
case Op_VectorMaskGen:
case Op_VectorCmpMasked:
case Op_LoadVectorMasked:
case Op_StoreVectorMasked:
if (!is_LP64 || !VM_Version::supports_avx512bw()) {
@@ -8084,7 +8086,34 @@ instruct vprorate(vec dst, vec src, vec shift) %{
%}

#ifdef _LP64
// ---------------------------------- Masked Block Copy ------------------------------------
// ---------------------------------- Masked Operations ------------------------------------

// Masked vector compare for the partially-inlined mismatch intrinsic
// (VectorCmpMasked): sets $dst to the lane index of the first mismatch
// between $src1 and $src2 among the lanes selected by $mask, or -1 when
// every selected lane compares equal.
instruct vmask_cmp_node(rRegI dst, vec src1, vec src2, kReg mask, kReg ktmp1, kReg ktmp2, rFlagsReg cr) %{
  match(Set dst (VectorCmpMasked src1 (Binary src2 mask)));
  effect(TEMP_DEF dst, TEMP ktmp1, TEMP ktmp2, KILL cr);
  format %{ "vector_mask_cmp $src1, $src2, $mask \t! vector mask comparison" %}
  ins_encode %{
    assert(vector_length_encoding(this, $src1) == vector_length_encoding(this, $src2), "mismatch");
    assert(vector_element_basic_type(this, $src1) == vector_element_basic_type(this, $src2), "mismatch");

    Label DONE;
    int vlen_enc = vector_length_encoding(this, $src1);
    BasicType elem_bt = vector_element_basic_type(this, $src1);

    // ktmp2 = ~mask: 1-bits for the lanes NOT under comparison.
    __ knotql($ktmp2$$KRegister, $mask$$KRegister);
    // Preload the "all lanes equal" answer.
    __ mov64($dst$$Register, -1L);
    // ktmp1 = lanes under mask where src1[i] == src2[i].
    __ evpcmp(elem_bt, $ktmp1$$KRegister, $mask$$KRegister, $src1$$XMMRegister, $src2$$XMMRegister, Assembler::eq, vlen_enc);
    // KORTESTQ sets CF when (ktmp2 | ktmp1) is all-ones, i.e. every lane
    // is either unselected or compared equal -> keep dst == -1.
    __ kortestql($ktmp2$$KRegister, $ktmp1$$KRegister);
    __ jccb(Assembler::carrySet, DONE);
    // Some selected lane differed: invert the equality mask so mismatching
    // lanes become 1-bits, then tzcnt yields the first mismatch index.
    __ kmovql($dst$$Register, $ktmp1$$KRegister);
    __ notq($dst$$Register);
    __ tzcntq($dst$$Register, $dst$$Register);
    __ bind(DONE);
  %}
  ins_pipe( pipe_slow );
%}


instruct vmasked_load64(vec dst, memory mem, kReg mask) %{
match(Set dst (LoadVectorMasked mem mask));
format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
@@ -789,6 +789,7 @@ bool InstructForm::captures_bottom_type(FormDict &globals) const {
!strcmp(_matrule->_rChild->_opType,"ShenandoahCompareAndExchangeN") ||
#endif
!strcmp(_matrule->_rChild->_opType,"StrInflatedCopy") ||
!strcmp(_matrule->_rChild->_opType,"VectorCmpMasked")||
!strcmp(_matrule->_rChild->_opType,"VectorMaskGen")||
!strcmp(_matrule->_rChild->_opType,"CompareAndExchangeP") ||
!strcmp(_matrule->_rChild->_opType,"CompareAndExchangeN"))) return true;
@@ -738,7 +738,7 @@ bool ArrayCopyNode::modifies(intptr_t offset_lo, intptr_t offset_hi, PhaseTransf

// As an optimization, choose optimum vector size for copy length known at compile time.
int ArrayCopyNode::get_partial_inline_vector_lane_count(BasicType type, int const_len) {
int lane_count = ArrayCopyPartialInlineSize/type2aelembytes(type);
int lane_count = ArrayOperationPartialInlineSize/type2aelembytes(type);
if (const_len > 0) {
int size_in_bytes = const_len * type2aelembytes(type);
if (size_in_bytes <= 16)
@@ -82,9 +82,10 @@
"actual size could be less depending on elements type") \
range(0, max_jint) \
\
product(intx, ArrayCopyPartialInlineSize, -1, DIAGNOSTIC, \
"Partial inline size used for array copy acceleration.") \
range(-1, 64) \
product(intx, ArrayOperationPartialInlineSize, 0, DIAGNOSTIC, \
"Partial inline size used for small array operations" \
"(e.g. copy,cmp) acceleration.") \
range(0, 64) \
\
product(bool, AlignVector, true, \
"Perform vector store/load alignment in loop") \
@@ -78,6 +78,11 @@ class CastIINode: public ConstraintCastNode {
: ConstraintCastNode(n, t, carry_dependency), _range_check_dependency(range_check_dependency) {
init_class_id(Class_CastII);
}
CastIINode(Node* ctrl, Node* n, const Type* t, bool carry_dependency = false, bool range_check_dependency = false)
: ConstraintCastNode(n, t, carry_dependency), _range_check_dependency(range_check_dependency) {
init_class_id(Class_CastII);
init_req(0, ctrl);
}
virtual int Opcode() const;
virtual uint ideal_reg() const { return Op_RegI; }
virtual Node* Identity(PhaseGVN* phase);
@@ -103,6 +108,11 @@ class CastIINode: public ConstraintCastNode {

class CastLLNode: public ConstraintCastNode {
public:
CastLLNode(Node* ctrl, Node* n, const Type* t, bool carry_dependency = false)
: ConstraintCastNode(n, t, carry_dependency) {
init_class_id(Class_CastLL);
init_req(0, ctrl);
}
CastLLNode(Node* n, const Type* t, bool carry_dependency = false)
: ConstraintCastNode(n, t, carry_dependency){
init_class_id(Class_CastLL);
@@ -417,6 +417,7 @@ macro(StoreVector)
macro(StoreVectorScatter)
macro(LoadVectorMasked)
macro(StoreVectorMasked)
macro(VectorCmpMasked)
macro(VectorMaskGen)
macro(VectorMaskOp)
macro(VectorMaskTrueCount)
@@ -3410,6 +3410,7 @@ void Compile::final_graph_reshaping_main_switch(Node* n, Final_Reshape_Counts& f
case Op_StoreVector:
case Op_LoadVectorGather:
case Op_StoreVectorScatter:
case Op_VectorCmpMasked:
case Op_VectorMaskGen:
case Op_LoadVectorMasked:
case Op_StoreVectorMasked:
Loading

1 comment on commit b05c40c

@openjdk-notifier

This comment has been minimized.

Copy link

@openjdk-notifier openjdk-notifier bot commented on b05c40c Jun 5, 2021

Please sign in to comment.