Skip to content

Commit cb84539

Browse files
author
Jatin Bhateja
committed
8261553: Efficient mask generation using BMI2 BZHI instruction
Reviewed-by: redestad, neliasso
1 parent: a065879; commit: cb84539

File tree

6 files changed: 28 additions (+28) and 27 deletions (-27).

src/hotspot/cpu/x86/assembler_x86.cpp

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9173,6 +9173,13 @@ void Assembler::evpblendmq (XMMRegister dst, KRegister mask, XMMRegister nds, XM
91739173
emit_int16(0x64, (0xC0 | encode));
91749174
}
91759175

9176+
void Assembler::bzhiq(Register dst, Register src1, Register src2) {
9177+
assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
9178+
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
9179+
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes);
9180+
emit_int16((unsigned char)0xF5, (0xC0 | encode));
9181+
}
9182+
91769183
void Assembler::shlxl(Register dst, Register src1, Register src2) {
91779184
assert(VM_Version::supports_bmi2(), "");
91789185
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);

src/hotspot/cpu/x86/assembler_x86.hpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2092,6 +2092,7 @@ class Assembler : public AbstractAssembler {
20922092
void shlxq(Register dst, Register src1, Register src2);
20932093
void shrxq(Register dst, Register src1, Register src2);
20942094

2095+
void bzhiq(Register dst, Register src1, Register src2);
20952096

20962097
//====================VECTOR ARITHMETIC=====================================
20972098
void evpmovd2m(KRegister kdst, XMMRegister src, int vector_len);

src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp

Lines changed: 3 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1894,17 +1894,9 @@ void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMReg
18941894
}
18951895

18961896
void C2_MacroAssembler::genmask(Register dst, Register len, Register temp) {
1897-
if (ArrayCopyPartialInlineSize <= 32) {
1898-
mov64(dst, 1);
1899-
shlxq(dst, dst, len);
1900-
decq(dst);
1901-
} else {
1902-
mov64(dst, -1);
1903-
movq(temp, len);
1904-
negptr(temp);
1905-
addptr(temp, 64);
1906-
shrxq(dst, dst, temp);
1907-
}
1897+
assert(ArrayCopyPartialInlineSize <= 64,"");
1898+
mov64(dst, -1L);
1899+
bzhiq(dst, dst, len);
19081900
}
19091901
#endif // _LP64
19101902

src/hotspot/cpu/x86/macroAssembler_x86_arrayCopy_avx3.cpp

Lines changed: 4 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -196,10 +196,8 @@ void MacroAssembler::copy64_masked_avx(Register dst, Register src, XMMRegister x
196196
} else {
197197
Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
198198
assert(MaxVectorSize == 64, "vector length != 64");
199-
negptr(length);
200-
addq(length, 64);
201-
mov64(temp, -1);
202-
shrxq(temp, temp, length);
199+
mov64(temp, -1L);
200+
bzhiq(temp, temp, length);
203201
kmovql(mask, temp);
204202
evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
205203
evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
@@ -213,9 +211,8 @@ void MacroAssembler::copy32_masked_avx(Register dst, Register src, XMMRegister x
213211
assert(MaxVectorSize >= 32, "vector length should be >= 32");
214212
BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
215213
Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
216-
mov64(temp, 1);
217-
shlxq(temp, temp, length);
218-
decq(temp);
214+
mov64(temp, -1L);
215+
bzhiq(temp, temp, length);
219216
kmovql(mask, temp);
220217
evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), Assembler::AVX_256bit);
221218
evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, Assembler::AVX_256bit);

src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

Lines changed: 12 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1471,6 +1471,7 @@ class StubGenerator: public StubCodeGenerator {
14711471
__ subq(temp1, loop_size[shift]);
14721472

14731473
// Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1474+
__ align(32);
14741475
__ BIND(L_main_loop);
14751476
__ copy64_avx(to, from, temp4, xmm1, false, shift, 0);
14761477
__ copy64_avx(to, from, temp4, xmm1, false, shift, 64);
@@ -1537,6 +1538,7 @@ class StubGenerator: public StubCodeGenerator {
15371538

15381539
// Main loop with aligned copy block size of 192 bytes at
15391540
// 64 byte copy granularity.
1541+
__ align(32);
15401542
__ BIND(L_main_loop_64bytes);
15411543
__ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
15421544
__ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
@@ -1676,6 +1678,7 @@ class StubGenerator: public StubCodeGenerator {
16761678
__ BIND(L_main_pre_loop);
16771679

16781680
// Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
1681+
__ align(32);
16791682
__ BIND(L_main_loop);
16801683
__ copy64_avx(to, from, temp1, xmm1, true, shift, -64);
16811684
__ copy64_avx(to, from, temp1, xmm1, true, shift, -128);
@@ -1708,6 +1711,7 @@ class StubGenerator: public StubCodeGenerator {
17081711

17091712
// Main loop with aligned copy block size of 192 bytes at
17101713
// 64 byte copy granularity.
1714+
__ align(32);
17111715
__ BIND(L_main_loop_64bytes);
17121716
__ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
17131717
__ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
@@ -1770,7 +1774,7 @@ class StubGenerator: public StubCodeGenerator {
17701774
//
17711775
address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
17721776
#if COMPILER2_OR_JVMCI
1773-
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
1777+
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
17741778
return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
17751779
aligned, false, false);
17761780
}
@@ -1886,7 +1890,7 @@ class StubGenerator: public StubCodeGenerator {
18861890
address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
18871891
address* entry, const char *name) {
18881892
#if COMPILER2_OR_JVMCI
1889-
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
1893+
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
18901894
return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
18911895
nooverlap_target, aligned, false, false);
18921896
}
@@ -1997,7 +2001,7 @@ class StubGenerator: public StubCodeGenerator {
19972001
//
19982002
address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
19992003
#if COMPILER2_OR_JVMCI
2000-
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
2004+
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
20012005
return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
20022006
aligned, false, false);
20032007
}
@@ -2128,7 +2132,7 @@ class StubGenerator: public StubCodeGenerator {
21282132
address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
21292133
address *entry, const char *name) {
21302134
#if COMPILER2_OR_JVMCI
2131-
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
2135+
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
21322136
return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
21332137
nooverlap_target, aligned, false, false);
21342138
}
@@ -2232,7 +2236,7 @@ class StubGenerator: public StubCodeGenerator {
22322236
address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
22332237
const char *name, bool dest_uninitialized = false) {
22342238
#if COMPILER2_OR_JVMCI
2235-
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
2239+
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
22362240
return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
22372241
aligned, is_oop, dest_uninitialized);
22382242
}
@@ -2343,7 +2347,7 @@ class StubGenerator: public StubCodeGenerator {
23432347
address *entry, const char *name,
23442348
bool dest_uninitialized = false) {
23452349
#if COMPILER2_OR_JVMCI
2346-
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
2350+
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
23472351
return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
23482352
nooverlap_target, aligned, is_oop, dest_uninitialized);
23492353
}
@@ -2456,7 +2460,7 @@ class StubGenerator: public StubCodeGenerator {
24562460
address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
24572461
const char *name, bool dest_uninitialized = false) {
24582462
#if COMPILER2_OR_JVMCI
2459-
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
2463+
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
24602464
return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
24612465
aligned, is_oop, dest_uninitialized);
24622466
}
@@ -2566,7 +2570,7 @@ class StubGenerator: public StubCodeGenerator {
25662570
address nooverlap_target, address *entry,
25672571
const char *name, bool dest_uninitialized = false) {
25682572
#if COMPILER2_OR_JVMCI
2569-
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
2573+
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
25702574
return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
25712575
nooverlap_target, aligned, is_oop, dest_uninitialized);
25722576
}

src/hotspot/cpu/x86/x86.ad

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1526,7 +1526,7 @@ const bool Matcher::match_rule_supported(int opcode) {
15261526
case Op_VectorMaskGen:
15271527
case Op_LoadVectorMasked:
15281528
case Op_StoreVectorMasked:
1529-
if (UseAVX < 3) {
1529+
if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
15301530
return false;
15311531
}
15321532
break;

0 commit comments

Comments (0)