Skip to content
This repository has been archived by the owner. It is now read-only.
Permalink
Browse files
8261553: Efficient mask generation using BMI2 BZHI instruction
Reviewed-by: redestad, neliasso
  • Loading branch information
Jatin Bhateja committed Feb 17, 2021
1 parent a065879 commit cb84539d56209a6687c4ec71a61fdbe6f06a46ea
@@ -9173,6 +9173,13 @@ void Assembler::evpblendmq (XMMRegister dst, KRegister mask, XMMRegister nds, XM
emit_int16(0x64, (0xC0 | encode));
}

// Emit BZHI (Zero High Bits Starting with Specified Bit Position), 64-bit form:
//   dst = src1 with every bit at position >= src2[7:0] cleared.
// Encoding is VEX.LZ.0F38.W1 F5 /r (BMI2).  Note the operand placement below:
// src1 is the value operand (ModRM r/m field) and src2 is the bit-index
// operand carried in VEX.vvvv — hence src2 is passed as the "nds" argument
// of vex_prefix_and_encode while src1 goes in the r/m slot.
void Assembler::bzhiq(Register dst, Register src1, Register src2) {
// BZHI is a BMI2 instruction; guard against emitting it on older CPUs.
assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
// vex_w=true selects the W1 (64-bit operand) form; legacy_mode/no_mask_reg:
// this is a GPR instruction, no EVEX masking or vector length applies.
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes);
// Opcode byte 0xF5, then ModRM with mod=11 (register-direct) | reg/rm encoding.
emit_int16((unsigned char)0xF5, (0xC0 | encode));
}

void Assembler::shlxl(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi2(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
@@ -2092,6 +2092,7 @@ class Assembler : public AbstractAssembler {
void shlxq(Register dst, Register src1, Register src2);
void shrxq(Register dst, Register src1, Register src2);

void bzhiq(Register dst, Register src1, Register src2);

//====================VECTOR ARITHMETIC=====================================
void evpmovd2m(KRegister kdst, XMMRegister src, int vector_len);
@@ -1894,17 +1894,9 @@ void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMReg
}

// Materialize a partial-copy mask in dst: the low `len` bits set, all
// higher bits clear (dst = (1 << len) - 1 for 0 <= len <= 64).
//
// A single BZHI on an all-ones source replaces the previous two-branch
// sequence (SHLX/DEC for lengths <= 32, NEG/ADD/SHRX otherwise): BZHI
// clears every bit of the source at position >= len in one instruction,
// and unlike the shift-based forms it is well-defined for len == 64.
//
// `temp` is no longer needed (the old > 32 path used it to compute
// 64 - len) but is kept in the signature for caller compatibility.
void C2_MacroAssembler::genmask(Register dst, Register len, Register temp) {
  // BZHI only handles bit positions up to 64; larger partial-inline sizes
  // would need a different mask-generation strategy.
  assert(ArrayCopyPartialInlineSize <= 64,"");
  mov64(dst, -1L);
  bzhiq(dst, dst, len);
}
#endif // _LP64

@@ -196,10 +196,8 @@ void MacroAssembler::copy64_masked_avx(Register dst, Register src, XMMRegister x
} else {
Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
assert(MaxVectorSize == 64, "vector length != 64");
negptr(length);
addq(length, 64);
mov64(temp, -1);
shrxq(temp, temp, length);
mov64(temp, -1L);
bzhiq(temp, temp, length);
kmovql(mask, temp);
evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
@@ -213,9 +211,8 @@ void MacroAssembler::copy32_masked_avx(Register dst, Register src, XMMRegister x
assert(MaxVectorSize >= 32, "vector length should be >= 32");
BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
mov64(temp, 1);
shlxq(temp, temp, length);
decq(temp);
mov64(temp, -1L);
bzhiq(temp, temp, length);
kmovql(mask, temp);
evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), Assembler::AVX_256bit);
evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, Assembler::AVX_256bit);
@@ -1471,6 +1471,7 @@ class StubGenerator: public StubCodeGenerator {
__ subq(temp1, loop_size[shift]);

// Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
__ align(32);
__ BIND(L_main_loop);
__ copy64_avx(to, from, temp4, xmm1, false, shift, 0);
__ copy64_avx(to, from, temp4, xmm1, false, shift, 64);
@@ -1537,6 +1538,7 @@ class StubGenerator: public StubCodeGenerator {

// Main loop with aligned copy block size of 192 bytes at
// 64 byte copy granularity.
__ align(32);
__ BIND(L_main_loop_64bytes);
__ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
__ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
@@ -1676,6 +1678,7 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_main_pre_loop);

// Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
__ align(32);
__ BIND(L_main_loop);
__ copy64_avx(to, from, temp1, xmm1, true, shift, -64);
__ copy64_avx(to, from, temp1, xmm1, true, shift, -128);
@@ -1708,6 +1711,7 @@ class StubGenerator: public StubCodeGenerator {

// Main loop with aligned copy block size of 192 bytes at
// 64 byte copy granularity.
__ align(32);
__ BIND(L_main_loop_64bytes);
__ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
__ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
@@ -1770,7 +1774,7 @@ class StubGenerator: public StubCodeGenerator {
//
address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
#if COMPILER2_OR_JVMCI
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
aligned, false, false);
}
@@ -1886,7 +1890,7 @@ class StubGenerator: public StubCodeGenerator {
address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
address* entry, const char *name) {
#if COMPILER2_OR_JVMCI
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
nooverlap_target, aligned, false, false);
}
@@ -1997,7 +2001,7 @@ class StubGenerator: public StubCodeGenerator {
//
address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
#if COMPILER2_OR_JVMCI
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
aligned, false, false);
}
@@ -2128,7 +2132,7 @@ class StubGenerator: public StubCodeGenerator {
address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
address *entry, const char *name) {
#if COMPILER2_OR_JVMCI
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
nooverlap_target, aligned, false, false);
}
@@ -2232,7 +2236,7 @@ class StubGenerator: public StubCodeGenerator {
address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
const char *name, bool dest_uninitialized = false) {
#if COMPILER2_OR_JVMCI
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
aligned, is_oop, dest_uninitialized);
}
@@ -2343,7 +2347,7 @@ class StubGenerator: public StubCodeGenerator {
address *entry, const char *name,
bool dest_uninitialized = false) {
#if COMPILER2_OR_JVMCI
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
nooverlap_target, aligned, is_oop, dest_uninitialized);
}
@@ -2456,7 +2460,7 @@ class StubGenerator: public StubCodeGenerator {
address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
const char *name, bool dest_uninitialized = false) {
#if COMPILER2_OR_JVMCI
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
aligned, is_oop, dest_uninitialized);
}
@@ -2566,7 +2570,7 @@ class StubGenerator: public StubCodeGenerator {
address nooverlap_target, address *entry,
const char *name, bool dest_uninitialized = false) {
#if COMPILER2_OR_JVMCI
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
nooverlap_target, aligned, is_oop, dest_uninitialized);
}
@@ -1526,7 +1526,7 @@ const bool Matcher::match_rule_supported(int opcode) {
case Op_VectorMaskGen:
case Op_LoadVectorMasked:
case Op_StoreVectorMasked:
if (UseAVX < 3) {
if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
return false;
}
break;

0 comments on commit cb84539

Please sign in to comment.