Skip to content
Permalink
Browse files
8277426: Optimize mask reduction operations on x86
Reviewed-by: sviswanathan, jiefu
  • Loading branch information
merykitty authored and DamonFool committed Nov 29, 2021
1 parent 3a4a94e commit 560f9c937233d548ef8db8cd9044fdc6c4cefe41
Showing 5 changed files with 193 additions and 80 deletions.
@@ -4306,6 +4306,20 @@ void Assembler::vpmovmskb(Register dst, XMMRegister src, int vec_enc) {
emit_int16((unsigned char)0xD7, (0xC0 | encode));
}

// Emits VMOVMSKPS (VEX 0F 50 /r, no SIMD prefix): collects the sign bit of
// each packed single-precision lane of src into the low bits of dst.
//   dst     - destination general-purpose register
//   src     - source vector register
//   vec_enc - vector length encoding handed to InstructionAttr
void Assembler::vmovmskps(Register dst, XMMRegister src, int vec_enc) {
assert(VM_Version::supports_avx(), "");
InstructionAttr attrs(vec_enc, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
// The middle operand slot is not used by this instruction and is passed as 0.
const int modrm_regs = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attrs);
// Opcode byte 0x50 followed by a register-direct ModRM byte (mod = 11).
emit_int16(0x50, (0xC0 | modrm_regs));
}

// Emits VMOVMSKPD (VEX.66.0F 50 /r): collects the sign bit of each packed
// double-precision lane of src into the low bits of dst.
//   dst     - destination general-purpose register
//   src     - source vector register
//   vec_enc - vector length encoding handed to InstructionAttr
void Assembler::vmovmskpd(Register dst, XMMRegister src, int vec_enc) {
assert(VM_Version::supports_avx(), "");
// The instruction is specified as WIG (VEX.W ignored), so rex_w stays false,
// matching vmovmskps; the 66 prefix alone selects the double-precision form.
InstructionAttr attributes(vec_enc, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
// Opcode byte 0x50 followed by a register-direct ModRM byte (mod = 11).
emit_int16(0x50, (0xC0 | encode));
}

void Assembler::vpmaskmovd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
assert((VM_Version::supports_avx2() && vector_len == AVX_256bit), "");
InstructionMark im(this);
@@ -1774,6 +1774,8 @@ class Assembler : public AbstractAssembler {

void pmovmskb(Register dst, XMMRegister src);
void vpmovmskb(Register dst, XMMRegister src, int vec_enc);
// AVX sign-mask extraction: move the per-lane sign bits of packed
// single-precision (ps) / double-precision (pd) elements of src into dst.
void vmovmskps(Register dst, XMMRegister src, int vec_enc);
void vmovmskpd(Register dst, XMMRegister src, int vec_enc);
void vpmaskmovd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

// SSE 4.1 extract
@@ -4060,61 +4060,123 @@ void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
}

#ifdef _LP64
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask,
Register tmp, int masklen, int masksize,
int vec_enc) {
if(VM_Version::supports_avx512bw()) {
kmovql(tmp, mask);
} else {
assert(masklen <= 16, "");
kmovwl(tmp, mask);
}
if (masksize < 16) {
andq(tmp, (((jlong)1 << masklen) - 1));
}
// Emits the scalar part of a vector-mask reduction. The callers first move the
// mask into general register tmp as a bit pattern (one bit per lane); this
// helper then computes the result of `opc` from tmp into dst.
//   opc     - one of Op_VectorMaskTrueCount / LastTrue / FirstTrue / ToLong
//   dst     - result register
//   tmp     - mask bits; overwritten by several of the sequences below
//   masklen - number of lanes in the mask
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
switch(opc) {
case Op_VectorMaskTrueCount:
// Number of set lanes == population count of the mask bits.
popcntq(dst, tmp);
break;
case Op_VectorMaskLastTrue:
// NOTE(review): the next three instructions repeat the bsrq/cmov fallback
// found in the else-branch below and look like pre-change lines kept by the
// diff rendering of this page -- confirm against the real file.
mov64(dst, -1);
bsrq(tmp, tmp);
cmov(Assembler::notZero, dst, tmp);
if (VM_Version::supports_lzcnt()) {
// dst = 63 - lzcnt(tmp); lzcnt of zero is the operand width (64), so an
// empty mask produces -1 without a conditional move.
lzcntq(tmp, tmp);
movl(dst, 63);
subl(dst, tmp);
} else {
// Preload -1 and only replace it with the bsr result when tmp was non-zero.
movl(dst, -1);
bsrq(tmp, tmp);
cmov32(Assembler::notZero, dst, tmp);
}
break;
case Op_VectorMaskFirstTrue:
// NOTE(review): the next three instructions also look like pre-change lines
// kept by the diff rendering -- confirm against the real file.
mov64(dst, masklen);
bsfq(tmp, tmp);
cmov(Assembler::notZero, dst, tmp);
if (VM_Version::supports_bmi1()) {
if (masklen < 32) {
// Plant a sentinel bit at position masklen so that an empty mask yields
// masklen; the callers skip clipping tmp for FirstTrue.
orl(tmp, 1 << masklen);
tzcntl(dst, tmp);
} else if (masklen == 32) {
// tzcnt of zero is the operand width, which already equals masklen.
tzcntl(dst, tmp);
} else {
assert(masklen == 64, "");
tzcntq(dst, tmp);
}
} else {
if (masklen < 32) {
// Sentinel bit guarantees a non-zero input for bsf.
orl(tmp, 1 << masklen);
bsfl(dst, tmp);
} else {
assert(masklen == 32 || masklen == 64, "");
// No room for a sentinel: preload masklen and keep it when tmp is zero.
movl(dst, masklen);
if (masklen == 32) {
bsfl(tmp, tmp);
} else {
bsfq(tmp, tmp);
}
cmov32(Assembler::notZero, dst, tmp);
}
}
break;
case Op_VectorMaskToLong:
// Nothing to emit: the mask bits are already in dst because dst and tmp are
// required to be the same register.
assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
break;
default: assert(false, "Unhandled mask operation");
}
}

void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
XMMRegister xtmp1, Register tmp, int masklen, int masksize,
int vec_enc) {
assert(VM_Version::supports_avx(), "");
vpxor(xtmp, xtmp, xtmp, vec_enc);
vpsubb(xtmp, xtmp, mask, vec_enc);
vpmovmskb(tmp, xtmp, vec_enc);
if (masksize < 16) {
andq(tmp, (((jlong)1 << masklen) - 1));
// Mask reduction for a mask held in an opmask (K) register: copies the mask
// bits into tmp, clips them to masklen bits when required, and hands over to
// vector_mask_operation_helper to compute `opc` into dst.
//   masksize - compared against 16 to decide whether clipping is needed; the
//              .ad callers in this change pass mask_len * element size in bytes
//   vec_enc  - not used by the statements visible in this block
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
int masklen, int masksize, int vec_enc) {
assert(VM_Version::supports_popcnt(), "");

// Without AVX512BW only the 16-bit kmov form is used, hence the masklen limit.
if(VM_Version::supports_avx512bw()) {
kmovql(tmp, mask);
} else {
assert(masklen <= 16, "");
kmovwl(tmp, mask);
}
// NOTE(review): the next three lines open a switch that is never closed in
// this block; they look like pre-change lines kept by the diff rendering of
// this page -- confirm against the real file.
switch(opc) {
case Op_VectorMaskTrueCount:
popcntq(dst, tmp);

// Mask generated out of partial vector comparisons/replicate/mask manipulation
// operations needs to be clipped.
// FirstTrue is exempt from the clipping below.
if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
andq(tmp, (1 << masklen) - 1);
}

vector_mask_operation_helper(opc, dst, tmp, masklen);
}

// Mask reduction for a mask held in a vector register: converts the per-lane
// mask into a bit pattern in tmp (strategy chosen by element type bt), clips
// it to masklen bits when required, and hands over to
// vector_mask_operation_helper to compute `opc` into dst.
//   mask    - vector mask; lanes are 0/1 for T_BOOLEAN and 0/-1 otherwise
//   xtmp    - vector scratch register (used for T_BOOLEAN and T_SHORT)
//   tmp     - general-purpose scratch register receiving the mask bits
//   masklen - number of lanes
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
Register tmp, int masklen, BasicType bt, int vec_enc) {
// 256-bit masks need AVX2 unless the lanes are at least 4 bytes wide, in
// which case only vmovmskps/vmovmskpd are emitted below.
assert(vec_enc == AVX_128bit && VM_Version::supports_avx() ||
vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4), "");
assert(VM_Version::supports_popcnt(), "");

bool need_clip = false;
switch(bt) {
case T_BOOLEAN:
// While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
// 0 - mask turns each 1 into 0xFF so that its sign bit can be collected.
vpxor(xtmp, xtmp, xtmp, vec_enc);
vpsubb(xtmp, xtmp, mask, vec_enc);
vpmovmskb(tmp, xtmp, vec_enc);
need_clip = masklen < 16;
break;
// NOTE(review): the Op_* case label below sits inside a switch over BasicType
// and looks like a pre-change line kept by the diff rendering -- confirm.
case Op_VectorMaskLastTrue:
mov64(dst, -1);
bsrq(tmp, tmp);
cmov(Assembler::notZero, dst, tmp);
case T_BYTE:
// Byte lanes already hold 0/-1: take the sign bits directly.
vpmovmskb(tmp, mask, vec_enc);
need_clip = masklen < 16;
break;
// NOTE(review): same remark as above for this Op_* case label -- confirm.
case Op_VectorMaskFirstTrue:
mov64(dst, masklen);
bsfq(tmp, tmp);
cmov(Assembler::notZero, dst, tmp);
case T_SHORT:
// Narrow 16-bit lanes to bytes (signed saturation keeps 0/-1 intact).
vpacksswb(xtmp, mask, mask, vec_enc);
if (masklen >= 16) {
// The 256-bit pack works per 128-bit half; immediate 8 selects source
// quadwords 0 and 2 so the packed bytes become contiguous in the low half.
vpermpd(xtmp, xtmp, 8, vec_enc);
}
// At most 16 byte lanes remain, so the 128-bit form is sufficient.
vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
need_clip = masklen < 16;
break;
// NOTE(review): this first default label looks like a pre-change line kept by
// the diff rendering (a second default follows below) -- confirm.
default: assert(false, "Unhandled mask operation");
case T_INT:
case T_FLOAT:
// One sign bit per 32-bit lane.
vmovmskps(tmp, mask, vec_enc);
need_clip = masklen < 4;
break;
case T_LONG:
case T_DOUBLE:
// One sign bit per 64-bit lane.
vmovmskpd(tmp, mask, vec_enc);
need_clip = masklen < 2;
break;
default: assert(false, "Unhandled type, %s", type2name(bt));
}

// Mask generated out of partial vector comparisons/replicate/mask manipulation
// operations needs to be clipped.
// FirstTrue is exempt from the clipping below.
if (need_clip && opc != Op_VectorMaskFirstTrue) {
// need_clip implies masklen < 32
andq(tmp, (1 << masklen) - 1);
}

vector_mask_operation_helper(opc, dst, tmp, masklen);
}
#endif
@@ -224,10 +224,12 @@

public:
#ifdef _LP64
// Computes the result of mask operation `opc` into dst from mask bits already
// materialised in general register tmp.
void vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen);

// Mask operation on a mask held in an opmask (K) register.
void vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, int masklen, int masksize, int vec_enc);

// NOTE(review): two XMMRegister overloads are listed here; the one taking
// xtmp1/masksize looks like the pre-change declaration kept by the diff
// rendering of this page -- confirm against the real header.
void vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, XMMRegister xtmp1,
Register tmp, int masklen, int masksize, int vec_enc);
// Mask operation on a mask held in a vector register whose lanes have type bt.
void vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
Register tmp, int masklen, BasicType bt, int vec_enc);
#endif
void string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp);
@@ -8647,43 +8647,45 @@ instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
effect(TEMP dst, KILL cr);
format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
ins_encode %{
int mask_len = Matcher::vector_length(this, $mask);
int opcode = this->ideal_Opcode();
BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
if (VM_Version::supports_avx512vlbw()) {
__ kmovql($dst$$Register, $mask$$KRegister);
} else {
assert(mask_len <= 16, "");
__ kmovwl($dst$$Register, $mask$$KRegister);
}
// Mask generated out of partial vector comparisons/replicate/mask manipulation
// operations needs to be clipped.
int mask_len = Matcher::vector_length(this, $mask);
int mask_size = mask_len * type2aelembytes(mbt);
if (mask_size < 16) {
__ andq($dst$$Register, (((jlong)1 << mask_len) - 1));
}
int vlen_enc = vector_length_encoding(this, $mask);
__ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
$dst$$Register, mask_len, mask_size, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

instruct vmask_tolong_avx(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL &&
n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
// VectorMaskToLong on a mask that is not of predicate-register (vectmask)
// type; the mask's element type is passed on to the macro assembler.
// NOTE(review): two format lines are listed below; the "vector_tolong_avx" one
// looks like a pre-change line kept by the diff rendering -- confirm.
instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
match(Set dst (VectorMaskToLong mask));
format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
ins_encode %{
int opcode = this->ideal_Opcode();
BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
int mask_len = Matcher::vector_length(this, $mask);
int vlen_enc = vector_length_encoding(this, $mask);
// dst is passed as both result and scratch register.
__ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
$dst$$Register, mask_len, mbt, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

// VectorMaskToLong applied directly to a VectorStoreMask: both nodes are
// matched as one instruction and the operation runs on the VectorStoreMask
// input vector `mask`.
instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
ins_encode %{
int opcode = this->ideal_Opcode();
BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
int mask_len = Matcher::vector_length(this, $mask);
int vlen_enc = vector_length_encoding(this, $mask);
// NOTE(review): the inline vpxor/vpsubb/vpmovmskb/andq sequence below
// duplicates what vector_mask_operation emits for boolean masks and looks
// like pre-change lines kept by the diff rendering -- confirm.
__ vpxor($xtmp$$XMMRegister, $xtmp$$XMMRegister, $xtmp$$XMMRegister, vlen_enc);
__ vpsubb($xtmp$$XMMRegister, $xtmp$$XMMRegister, $mask$$XMMRegister, vlen_enc);
__ vpmovmskb($dst$$Register, $xtmp$$XMMRegister, vlen_enc);
// Mask generated out of partial vector comparisons/replicate/mask manipulation
// operations needs to be clipped.
int mask_size = mask_len * type2aelembytes(mbt);
if (mask_size < 16) {
__ andq($dst$$Register, (((jlong)1 << mask_len) - 1));
}
// dst is passed as both result and scratch register.
__ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
$dst$$Register, mask_len, mbt, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
@@ -8699,25 +8701,40 @@ instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
int mask_len = Matcher::vector_length(this, $mask);
int mask_size = mask_len * type2aelembytes(mbt);
int vlen_enc = vector_length_encoding(this, $mask);
__ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister, $tmp$$Register,
mask_len, mask_size, vlen_enc);
__ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
$tmp$$Register, mask_len, mask_size, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

instruct vmask_truecount_avx(rRegI dst, vec mask, rRegL tmp, vec xtmp, vec xtmp1, rFlagsReg cr) %{
// VectorMaskTrueCount on a mask that is not of predicate-register (vectmask)
// type; the mask's element type is passed on to the macro assembler.
// NOTE(review): effect/format appear twice below; the pair mentioning $xtmp1
// (not an operand of this instruct) looks like pre-change lines kept by the
// diff rendering -- confirm.
instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
match(Set dst (VectorMaskTrueCount mask));
effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, TEMP xtmp1, KILL cr);
format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp and $xtmp1 as TEMP" %}
effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
ins_encode %{
int opcode = this->ideal_Opcode();
BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
int mask_len = Matcher::vector_length(this, $mask);
int vlen_enc = vector_length_encoding(this, $mask);
// tmp receives the mask bits; the count is produced in dst.
__ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
$tmp$$Register, mask_len, mbt, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

// VectorMaskTrueCount applied directly to a VectorStoreMask: both nodes are
// matched as one instruction and the count is taken on the VectorStoreMask
// input vector `mask`.
instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
ins_encode %{
int opcode = this->ideal_Opcode();
BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
int mask_len = Matcher::vector_length(this, $mask);
// NOTE(review): mask_size and the argument line mentioning $xtmp1 below look
// like pre-change lines kept by the diff rendering -- confirm.
int mask_size = mask_len * type2aelembytes(mbt);
int vlen_enc = vector_length_encoding(this, $mask);
__ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
$xtmp1$$XMMRegister, $tmp$$Register, mask_len, mask_size, vlen_enc);
$tmp$$Register, mask_len, mbt, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
@@ -8734,26 +8751,42 @@ instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsRe
int mask_len = Matcher::vector_length(this, $mask);
int mask_size = mask_len * type2aelembytes(mbt);
int vlen_enc = vector_length_encoding(this, $mask);
__ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister, $tmp$$Register, mask_len,
mask_size, vlen_enc);
__ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
$tmp$$Register, mask_len, mask_size, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, rRegL tmp, vec xtmp, vec xtmp1, rFlagsReg cr) %{
// VectorMaskFirstTrue / VectorMaskLastTrue on a mask that is not of
// predicate-register (vectmask) type; which of the two is emitted is decided
// by the ideal opcode passed to the macro assembler.
// NOTE(review): effect/format appear twice below; the pair mentioning $xtmp1
// (not an operand of this instruct) looks like pre-change lines kept by the
// diff rendering -- confirm.
instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
match(Set dst (VectorMaskFirstTrue mask));
match(Set dst (VectorMaskLastTrue mask));
effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, TEMP xtmp1, KILL cr);
format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp and $xtmp1 as TEMP" %}
effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
ins_encode %{
int opcode = this->ideal_Opcode();
BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
int mask_len = Matcher::vector_length(this, $mask);
int vlen_enc = vector_length_encoding(this, $mask);
// tmp receives the mask bits; the lane index is produced in dst.
__ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
$tmp$$Register, mask_len, mbt, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

// VectorMaskFirstTrue / VectorMaskLastTrue applied directly to a
// VectorStoreMask: both nodes are matched as one instruction and the search
// runs on the VectorStoreMask input vector `mask`.
instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
ins_encode %{
int opcode = this->ideal_Opcode();
BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
int mask_len = Matcher::vector_length(this, $mask);
// NOTE(review): mask_size and the argument line mentioning $xtmp1 below look
// like pre-change lines kept by the diff rendering -- confirm.
int mask_size = mask_len * type2aelembytes(mbt);
int vlen_enc = vector_length_encoding(this, $mask);
__ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
$xtmp1$$XMMRegister, $tmp$$Register, mask_len, mask_size, vlen_enc);
$tmp$$Register, mask_len, mbt, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

1 comment on commit 560f9c9

@openjdk-notifier
Copy link

@openjdk-notifier openjdk-notifier bot commented on 560f9c9 Nov 29, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.