Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8277426: Optimize mask reduction operations on x86 #6447

Closed
wants to merge 4 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
@@ -4306,6 +4306,20 @@ void Assembler::vpmovmskb(Register dst, XMMRegister src, int vec_enc) {
emit_int16((unsigned char)0xD7, (0xC0 | encode));
}

// Emits the VEX-encoded VMOVMSKPS instruction (opcode 0F 50, no SIMD prefix)
// in register-direct form.
//   dst     - general-purpose register receiving the result
//   src     - vector register operand
//   vec_enc - vector length encoding forwarded to the VEX prefix
void Assembler::vmovmskps(Register dst, XMMRegister src, int vec_enc) {
  assert(VM_Version::supports_avx(), "");
  const bool rex_w       = false;
  const bool legacy_mode = true;
  const bool no_mask_reg = true;
  const bool uses_vl     = false;
  InstructionAttr attr(vec_enc, rex_w, legacy_mode, no_mask_reg, uses_vl);
  // The middle register-encoding argument is passed as 0: only dst and src are supplied.
  const int reg_bits = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(),
                                             VEX_SIMD_NONE, VEX_OPCODE_0F, &attr);
  // OR-ing with 0xC0 selects the register-to-register ModRM form.
  emit_int16(0x50, (0xC0 | reg_bits));
}

// Emits the VEX-encoded VMOVMSKPD instruction (opcode 0F 50 with the 66 SIMD
// prefix) in register-direct form.
//   dst     - general-purpose register receiving the result
//   src     - vector register operand
//   vec_enc - vector length encoding forwarded to the VEX prefix
void Assembler::vmovmskpd(Register dst, XMMRegister src, int vec_enc) {
  assert(VM_Version::supports_avx(), "");
  const bool rex_w       = true;
  const bool legacy_mode = true;
  const bool no_mask_reg = true;
  const bool uses_vl     = false;
  InstructionAttr attr(vec_enc, rex_w, legacy_mode, no_mask_reg, uses_vl);
  // The middle register-encoding argument is passed as 0: only dst and src are supplied.
  const int reg_bits = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(),
                                             VEX_SIMD_66, VEX_OPCODE_0F, &attr);
  // OR-ing with 0xC0 selects the register-to-register ModRM form.
  emit_int16(0x50, (0xC0 | reg_bits));
}

void Assembler::vpmaskmovd(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
assert((VM_Version::supports_avx2() && vector_len == AVX_256bit), "");
InstructionMark im(this);
@@ -1774,6 +1774,8 @@ class Assembler : public AbstractAssembler {

void pmovmskb(Register dst, XMMRegister src);
void vpmovmskb(Register dst, XMMRegister src, int vec_enc);
// VEX-encoded emitters for opcode 0F 50 that write a general-purpose register
// from a vector register: the ps form is emitted without a SIMD prefix, the pd
// form with the 66 prefix. Both assert AVX support.
void vmovmskps(Register dst, XMMRegister src, int vec_enc);
void vmovmskpd(Register dst, XMMRegister src, int vec_enc);
void vpmaskmovd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

// SSE 4.1 extract
@@ -4060,61 +4060,123 @@ void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
}

#ifdef _LP64
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask,
Register tmp, int masklen, int masksize,
int vec_enc) {
if(VM_Version::supports_avx512bw()) {
kmovql(tmp, mask);
} else {
assert(masklen <= 16, "");
kmovwl(tmp, mask);
}
if (masksize < 16) {
andq(tmp, (((jlong)1 << masklen) - 1));
}
// Emits the scalar tail shared by the vector mask reductions. On entry `tmp`
// holds the mask as an integer with one bit per lane; the result is produced
// in `dst`.
//   opc     - Op_VectorMaskTrueCount, Op_VectorMaskLastTrue,
//             Op_VectorMaskFirstTrue or Op_VectorMaskToLong
//   dst     - result register
//   tmp     - bitmask input; overwritten by several of the paths below
//   masklen - number of lanes in the mask
// Every case emits exactly one instruction sequence, so `tmp` still holds the
// unmodified mask when that sequence reads it.
void C2_MacroAssembler::vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen) {
  switch(opc) {
    case Op_VectorMaskTrueCount:
      popcntq(dst, tmp);
      break;
    case Op_VectorMaskLastTrue:
      if (VM_Version::supports_lzcnt()) {
        // dst = 63 - lzcnt(tmp); an all-zero mask counts 64 leading zeros,
        // which yields -1.
        lzcntq(tmp, tmp);
        movl(dst, 63);
        subl(dst, tmp);
      } else {
        // Preload -1 and replace it with the bsr result only when the
        // notZero condition holds after bsrq.
        movl(dst, -1);
        bsrq(tmp, tmp);
        cmov32(Assembler::notZero, dst, tmp);
      }
      break;
    case Op_VectorMaskFirstTrue:
      if (VM_Version::supports_bmi1()) {
        if (masklen < 32) {
          // Plant a sentinel bit at position masklen so an empty mask
          // reports masklen.
          orl(tmp, 1 << masklen);
          tzcntl(dst, tmp);
        } else if (masklen == 32) {
          tzcntl(dst, tmp);
        } else {
          assert(masklen == 64, "");
          tzcntq(dst, tmp);
        }
      } else {
        if (masklen < 32) {
          // Same sentinel trick; with the sentinel set the bsfl source is
          // never zero.
          orl(tmp, 1 << masklen);
          bsfl(dst, tmp);
        } else {
          assert(masklen == 32 || masklen == 64, "");
          // No spare bit for a sentinel: preload masklen and overwrite it
          // only when the notZero condition holds after the scan.
          movl(dst, masklen);
          if (masklen == 32) {
            bsfl(tmp, tmp);
          } else {
            bsfq(tmp, tmp);
          }
          cmov32(Assembler::notZero, dst, tmp);
        }
      }
      break;
    case Op_VectorMaskToLong:
      // Nothing to emit: the caller materialised the mask directly in dst.
      assert(dst == tmp, "Dst and tmp should be the same for toLong operations");
      break;
    default: assert(false, "Unhandled mask operation");
  }
}

void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
XMMRegister xtmp1, Register tmp, int masklen, int masksize,
int vec_enc) {
assert(VM_Version::supports_avx(), "");
vpxor(xtmp, xtmp, xtmp, vec_enc);
vpsubb(xtmp, xtmp, mask, vec_enc);
vpmovmskb(tmp, xtmp, vec_enc);
if (masksize < 16) {
andq(tmp, (((jlong)1 << masklen) - 1));
// Mask reduction for a mask held in an opmask (K) register: copies the mask
// into `tmp`, clips it when required, then hands over to
// vector_mask_operation_helper for the opcode-specific tail.
//   opc      - ideal opcode of the reduction (forwarded to the helper)
//   dst      - result register
//   mask     - opmask register holding the mask
//   tmp      - scratch general-purpose register receiving the bitmask
//   masklen  - number of lanes in the mask
//   masksize - compared against 16 to decide whether clipping is needed
//   vec_enc  - not used by this overload
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp,
                                              int masklen, int masksize, int vec_enc) {
  assert(VM_Version::supports_popcnt(), "");

  if(VM_Version::supports_avx512bw()) {
    kmovql(tmp, mask);
  } else {
    // Only the 16-bit move is used on this path, so the mask must fit in it.
    assert(masklen <= 16, "");
    kmovwl(tmp, mask);
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped. FirstTrue is exempt: for masklen < 32 the
  // helper plants a sentinel bit at position masklen instead.
  if (masksize < 16 && opc != Op_VectorMaskFirstTrue) {
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}

// Mask reduction for a mask held in a vector register: converts the per-lane
// mask into an integer bitmask in `tmp` using a sequence chosen by the element
// type, clips it when required, then hands over to
// vector_mask_operation_helper for the opcode-specific tail.
//   opc     - ideal opcode of the reduction (forwarded to the helper)
//   dst     - result register
//   mask    - vector register holding the mask lanes
//   xtmp    - scratch vector register (used by the T_BOOLEAN and T_SHORT paths)
//   tmp     - scratch general-purpose register receiving the bitmask
//   masklen - number of lanes in the mask
//   bt      - element type of the mask lanes
//   vec_enc - vector length encoding (AVX_128bit or AVX_256bit)
// The switch dispatches on `bt` only; all opcode-specific work lives in the
// helper.
void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
                                              Register tmp, int masklen, BasicType bt, int vec_enc) {
  assert((vec_enc == AVX_128bit && VM_Version::supports_avx()) ||
         (vec_enc == AVX_256bit && (VM_Version::supports_avx2() || type2aelembytes(bt) >= 4)), "");
  assert(VM_Version::supports_popcnt(), "");

  bool need_clip = false;
  switch(bt) {
    case T_BOOLEAN:
      // While masks of other types contain 0, -1; boolean masks contain lane values of 0, 1
      // Computing 0 - mask turns each 1 into -1 before the byte mask is collected.
      vpxor(xtmp, xtmp, xtmp, vec_enc);
      vpsubb(xtmp, xtmp, mask, vec_enc);
      vpmovmskb(tmp, xtmp, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_BYTE:
      vpmovmskb(tmp, mask, vec_enc);
      need_clip = masklen < 16;
      break;
    case T_SHORT:
      // Narrow the 16-bit lanes to bytes so a 128-bit vpmovmskb can collect them.
      vpacksswb(xtmp, mask, mask, vec_enc);
      if (masklen >= 16) {
        // NOTE(review): presumably gathers the per-128-bit-lane pack results
        // into the low half -- confirm against the VPERMPD immediate encoding.
        vpermpd(xtmp, xtmp, 8, vec_enc);
      }
      vpmovmskb(tmp, xtmp, Assembler::AVX_128bit);
      need_clip = masklen < 16;
      break;
    case T_INT:
    case T_FLOAT:
      vmovmskps(tmp, mask, vec_enc);
      need_clip = masklen < 4;
      break;
    case T_LONG:
    case T_DOUBLE:
      vmovmskpd(tmp, mask, vec_enc);
      need_clip = masklen < 2;
      break;
    default: assert(false, "Unhandled type, %s", type2name(bt));
  }

  // Mask generated out of partial vector comparisons/replicate/mask manipulation
  // operations needs to be clipped. FirstTrue is exempt: for masklen < 32 the
  // helper plants a sentinel bit at position masklen instead.
  if (need_clip && opc != Op_VectorMaskFirstTrue) {
    // need_clip implies masklen < 32
    andq(tmp, (1 << masklen) - 1);
  }

  vector_mask_operation_helper(opc, dst, tmp, masklen);
}
#endif
@@ -224,10 +224,12 @@

public:
#ifdef _LP64
// Opcode-specific scalar tail of the mask reductions: switches on opc, reads the
// integer bitmask from tmp and leaves the result in dst.
void vector_mask_operation_helper(int opc, Register dst, Register tmp, int masklen);

// Variant for masks held in an opmask (KRegister); moves the mask into tmp
// before calling the helper.
void vector_mask_operation(int opc, Register dst, KRegister mask, Register tmp, int masklen, int masksize, int vec_enc);

// NOTE(review): the only definition of this xtmp1/masksize signature visible in
// the .cpp is cut off mid-body -- confirm this overload is still needed.
void vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp, XMMRegister xtmp1,
Register tmp, int masklen, int masksize, int vec_enc);
// Variant for masks held in a vector register; the bitmask extraction sequence
// is chosen by the element type bt.
void vector_mask_operation(int opc, Register dst, XMMRegister mask, XMMRegister xtmp,
Register tmp, int masklen, BasicType bt, int vec_enc);
#endif
void string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp);
@@ -8647,43 +8647,45 @@ instruct vmask_tolong_evex(rRegL dst, kReg mask, rFlagsReg cr) %{
effect(TEMP dst, KILL cr);
format %{ "vector_tolong_evex $dst, $mask \t! vector mask tolong" %}
ins_encode %{
int mask_len = Matcher::vector_length(this, $mask);
int opcode = this->ideal_Opcode();
BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
if (VM_Version::supports_avx512vlbw()) {
__ kmovql($dst$$Register, $mask$$KRegister);
} else {
assert(mask_len <= 16, "");
__ kmovwl($dst$$Register, $mask$$KRegister);
}
// Mask generated out of partial vector comparisons/replicate/mask manipulation
// operations needs to be clipped.
int mask_len = Matcher::vector_length(this, $mask);
int mask_size = mask_len * type2aelembytes(mbt);
if (mask_size < 16) {
__ andq($dst$$Register, (((jlong)1 << mask_len) - 1));
}
int vlen_enc = vector_length_encoding(this, $mask);
__ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
$dst$$Register, mask_len, mask_size, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

instruct vmask_tolong_avx(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL &&
n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
// VectorMaskToLong on a mask that is not a predicate-register (vectmask) type.
// dst doubles as the scratch register handed to vector_mask_operation, which is
// what the helper's ToLong case asserts (dst == tmp).
instruct vmask_tolong_bool(rRegL dst, vec mask, vec xtmp, rFlagsReg cr) %{
  predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
  match(Set dst (VectorMaskToLong mask));
  format %{ "vector_tolong_bool $dst, $mask \t! using $xtmp as TEMP" %}
  effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    int vlen_enc = vector_length_encoding(this, $mask);
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
                             $dst$$Register, mask_len, mbt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// VectorMaskToLong applied directly to the input of a VectorStoreMask: matches
// the (VectorMaskToLong (VectorStoreMask mask size)) subtree and passes the
// element type of $mask to vector_mask_operation. All bitmask extraction and
// clipping is done inside that call; dst doubles as its scratch register.
instruct vmask_tolong_avx(rRegL dst, vec mask, immI size, vec xtmp, rFlagsReg cr) %{
  predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
  match(Set dst (VectorMaskToLong (VectorStoreMask mask size)));
  format %{ "vector_tolong_avx $dst, $mask \t! using $xtmp as TEMP" %}
  effect(TEMP_DEF dst, TEMP xtmp, KILL cr);
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    int vlen_enc = vector_length_encoding(this, $mask);
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
                             $dst$$Register, mask_len, mbt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
@@ -8699,25 +8701,40 @@ instruct vmask_truecount_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsReg cr) %{
int mask_len = Matcher::vector_length(this, $mask);
int mask_size = mask_len * type2aelembytes(mbt);
int vlen_enc = vector_length_encoding(this, $mask);
__ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister, $tmp$$Register,
mask_len, mask_size, vlen_enc);
__ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
$tmp$$Register, mask_len, mask_size, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

instruct vmask_truecount_avx(rRegI dst, vec mask, rRegL tmp, vec xtmp, vec xtmp1, rFlagsReg cr) %{
// VectorMaskTrueCount on a mask that is not a predicate-register (vectmask)
// type. Only operands declared in the signature (dst, tmp, xtmp, cr) appear in
// the effect and format clauses.
instruct vmask_truecount_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
  predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
  match(Set dst (VectorMaskTrueCount mask));
  effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
  format %{ "vector_truecount_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    int vlen_enc = vector_length_encoding(this, $mask);
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
                             $tmp$$Register, mask_len, mbt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// VectorMaskTrueCount applied directly to the input of a VectorStoreMask:
// matches the (VectorMaskTrueCount (VectorStoreMask mask size)) subtree and
// passes the element type of $mask to vector_mask_operation.
instruct vmask_truecount_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
  predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
  match(Set dst (VectorMaskTrueCount (VectorStoreMask mask size)));
  effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
  format %{ "vector_truecount_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    int vlen_enc = vector_length_encoding(this, $mask);
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
                             $tmp$$Register, mask_len, mbt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}
@@ -8734,26 +8751,42 @@ instruct vmask_first_or_last_true_evex(rRegI dst, kReg mask, rRegL tmp, rFlagsRe
int mask_len = Matcher::vector_length(this, $mask);
int mask_size = mask_len * type2aelembytes(mbt);
int vlen_enc = vector_length_encoding(this, $mask);
__ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister, $tmp$$Register, mask_len,
mask_size, vlen_enc);
__ vector_mask_operation(opcode, $dst$$Register, $mask$$KRegister,
$tmp$$Register, mask_len, mask_size, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, rRegL tmp, vec xtmp, vec xtmp1, rFlagsReg cr) %{
// VectorMaskFirstTrue / VectorMaskLastTrue on a mask that is not a
// predicate-register (vectmask) type. The ideal opcode is forwarded so
// vector_mask_operation can pick the first-true or last-true tail.
instruct vmask_first_or_last_true_bool(rRegI dst, vec mask, rRegL tmp, vec xtmp, rFlagsReg cr) %{
  predicate(n->in(1)->bottom_type()->isa_vectmask() == NULL);
  match(Set dst (VectorMaskFirstTrue mask));
  match(Set dst (VectorMaskLastTrue mask));
  effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
  format %{ "vector_mask_first_or_last_true_bool $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    int vlen_enc = vector_length_encoding(this, $mask);
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
                             $tmp$$Register, mask_len, mbt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}

// VectorMaskFirstTrue / VectorMaskLastTrue applied directly to the input of a
// VectorStoreMask: matches the (VectorMask{First,Last}True (VectorStoreMask
// mask size)) subtrees and passes the element type of $mask to
// vector_mask_operation.
instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp, vec xtmp, rFlagsReg cr) %{
  predicate(n->in(1)->in(1)->bottom_type()->isa_vectmask() == NULL);
  match(Set dst (VectorMaskFirstTrue (VectorStoreMask mask size)));
  match(Set dst (VectorMaskLastTrue (VectorStoreMask mask size)));
  effect(TEMP_DEF dst, TEMP tmp, TEMP xtmp, KILL cr);
  format %{ "vector_mask_first_or_last_true_avx $dst, $mask \t! using $tmp, $xtmp as TEMP" %}
  ins_encode %{
    int opcode = this->ideal_Opcode();
    BasicType mbt = Matcher::vector_element_basic_type(this, $mask);
    int mask_len = Matcher::vector_length(this, $mask);
    int vlen_enc = vector_length_encoding(this, $mask);
    __ vector_mask_operation(opcode, $dst$$Register, $mask$$XMMRegister, $xtmp$$XMMRegister,
                             $tmp$$Register, mask_len, mbt, vlen_enc);
  %}
  ins_pipe( pipe_slow );
%}