8288047: Accelerate Poly1305 on x86_64 using AVX512 instructions #10582

Closed: wants to merge 30 commits.

Changes from 16 commits. Commits:
e3cfc74  Poly1305 AVX512 intrinsic for x86_64 (vpaprotsk, Sep 30, 2022)
ec6c807  Merge remote-tracking branch 'vpaprotsk/master' into avx512-poly (vpaprotsk, Oct 13, 2022)
507d6bf  Fix whitespace and copyright statements (vpaprotsk, Oct 13, 2022)
7e070d9  missed white-space fix (vpaprotsk, Oct 13, 2022)
f048f93  further restrict UsePolyIntrinsics with supports_avx512vlbw (vpaprotsk, Oct 21, 2022)
fb122f3  Merge remote-tracking branch 'origin' into avx512-poly (vpaprotsk, Oct 24, 2022)
d2a47f1  Merge remote-tracking branch 'origin/master' into avx512-poly (vpaprotsk, Oct 24, 2022)
de7e138  assembler checks and test case fixes (vpaprotsk, Oct 24, 2022)
883be10  extra whitespace character (vpaprotsk, Oct 24, 2022)
78fd8fd  invalidkeyexception and some review comments (vpaprotsk, Oct 28, 2022)
977e027  address Jamil's review (vpaprotsk, Nov 4, 2022)
38d9e83  Merge remote-tracking branch 'origin/master' into avx512-poly (vpaprotsk, Nov 4, 2022)
1841df1  iwanowww review (vpaprotsk, Nov 8, 2022)
686e061  Merge remote-tracking branch 'origin/master' into avx512-poly (vpaprotsk, Nov 8, 2022)
120247d  make UsePolyIntrinsics option diagnostic (vpaprotsk, Nov 8, 2022)
da56045  fix 32-bit build (vpaprotsk, Nov 8, 2022)
8b1b40f  add getLimbs to interface and reviews (vpaprotsk, Nov 9, 2022)
abfc68f  fix windows and 32b linux builds (vpaprotsk, Nov 10, 2022)
2176caf  Sandhya's review (vpaprotsk, Nov 10, 2022)
196ee35  jcheck (vpaprotsk, Nov 10, 2022)
835fbe3  live review with Sandhya (vpaprotsk, Nov 11, 2022)
2a225e4  Vladimir's review (vpaprotsk, Nov 11, 2022)
a26ac7d  Merge remote-tracking branch 'origin/master' into avx512-poly (vpaprotsk, Nov 14, 2022)
3fafa11  Merge remote-tracking branch 'origin/master' into avx512-poly (vpaprotsk, Nov 15, 2022)
8f5942d  Vladimir's review comments (vpaprotsk, Nov 15, 2022)
58488f4  extra whitespace (vpaprotsk, Nov 15, 2022)
cbf4938  use noreg properly in poly1305_limbs (vpaprotsk, Nov 15, 2022)
dbdfd1d  redo register alloc with explicit func params (vpaprotsk, Nov 16, 2022)
56aed9b  vzeroall, no spill, reg re-map (vpaprotsk, Nov 17, 2022)
08ea45e  remove early return (vpaprotsk, Nov 17, 2022)
133 changes: 117 additions & 16 deletions src/hotspot/cpu/x86/assembler_x86.cpp
@@ -5008,6 +5008,40 @@ assert(vector_len == AVX_128bit? VM_Version::supports_avx() :
emit_int16(0x04, (0xC0 | encode));
}

void Assembler::evpmadd52luq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) {
evpmadd52luq(dst, k0, src1, src2, false, vector_len);
}

void Assembler::evpmadd52luq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
assert(VM_Version::supports_avx512ifma(), "");
InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}

int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0xB4, (0xC0 | encode));
}

void Assembler::evpmadd52huq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) {
evpmadd52huq(dst, k0, src1, src2, false, vector_len);
}

void Assembler::evpmadd52huq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
assert(VM_Version::supports_avx512ifma(), "");
InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}

int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int16((unsigned char)0xB5, (0xC0 | encode));
}
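
// Editorial sketch (not part of the patch): a scalar model of what one 64-bit
// lane of the two IFMA encodings above computes. VPMADD52LUQ accumulates the
// low 52 bits of the 104-bit product of the sources' low 52 bits; VPMADD52HUQ
// accumulates the high 52 bits. Assumes <cstdint> and a compiler with
// unsigned __int128 (GCC/Clang).
static inline uint64_t madd52lo(uint64_t acc, uint64_t a, uint64_t b) {
  const uint64_t m52 = (UINT64_C(1) << 52) - 1;
  unsigned __int128 p = (unsigned __int128)(a & m52) * (b & m52);
  return acc + ((uint64_t)p & m52);   // lane semantics of VPMADD52LUQ
}
static inline uint64_t madd52hi(uint64_t acc, uint64_t a, uint64_t b) {
  const uint64_t m52 = (UINT64_C(1) << 52) - 1;
  unsigned __int128 p = (unsigned __int128)(a & m52) * (b & m52);
  return acc + (uint64_t)(p >> 52);   // lane semantics of VPMADD52HUQ
}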

void Assembler::evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(VM_Version::supports_avx512_vnni(), "must support vnni");
@@ -5425,6 +5459,42 @@ void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) {
emit_int16(0x6C, (0xC0 | encode));
}

void Assembler::evpunpcklqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) {
evpunpcklqdq(dst, k0, src1, src2, false, vector_len);
}

void Assembler::evpunpcklqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "requires AVX512F");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}

int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int16(0x6C, (0xC0 | encode));
}

void Assembler::evpunpckhqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) {
evpunpckhqdq(dst, k0, src1, src2, false, vector_len);
}

void Assembler::evpunpckhqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "requires AVX512F");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
if (merge) {
attributes.reset_is_clear_context();
}

int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int16(0x6D, (0xC0 | encode));
}
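
// Editorial sketch (not part of the patch): per 128-bit lane, the quadword
// unpacks above interleave as follows; PUNPCKLQDQ takes the low quadword of
// each source, PUNPCKHQDQ the high quadword.
struct Lane128 { uint64_t q0, q1; };
static inline Lane128 unpacklo_qdq(Lane128 a, Lane128 b) { return { a.q0, b.q0 }; }
static inline Lane128 unpackhi_qdq(Lane128 a, Lane128 b) { return { a.q1, b.q1 }; }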

void Assembler::push(int32_t imm32) {
// in 64bits we push 64bits onto the stack but only
// take a 32bit immediate
@@ -5869,6 +5939,18 @@ void Assembler::shrdl(Register dst, Register src, int8_t imm8) {
emit_int32(0x0F, (unsigned char)0xAC, (0xC0 | encode), imm8);
}

#ifdef _LP64
void Assembler::shldq(Register dst, Register src, int8_t imm8) {
int encode = prefixq_and_encode(src->encoding(), dst->encoding());
emit_int32(0x0F, (unsigned char)0xA4, (0xC0 | encode), imm8);
}

void Assembler::shrdq(Register dst, Register src, int8_t imm8) {
int encode = prefixq_and_encode(src->encoding(), dst->encoding());
emit_int32(0x0F, (unsigned char)0xAC, (0xC0 | encode), imm8);
}
#endif
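
// Editorial sketch (not part of the patch): the 64-bit double-precision
// shifts added above compute, for 0 < imm8 < 64:
static inline uint64_t shld64(uint64_t dst, uint64_t src, unsigned imm) {
  return (dst << imm) | (src >> (64 - imm));  // SHLD: fill from src's high bits
}
static inline uint64_t shrd64(uint64_t dst, uint64_t src, unsigned imm) {
  return (dst >> imm) | (src << (64 - imm));  // SHRD: fill from src's low bits
}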

// copies a single word from [esi] to [edi]
void Assembler::smovl() {
emit_int8((unsigned char)0xA5);
@@ -7740,11 +7822,12 @@ void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_
emit_operand(dst, src, 0);
}

void Assembler::vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int16((unsigned char)0xDB, (0xC0 | encode));
void Assembler::evpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
evpandq(dst, k0, nds, src, false, vector_len);
}

void Assembler::evpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
evpandq(dst, k0, nds, src, false, vector_len);
}

//Variable Shift packed integers logically left.
@@ -7857,13 +7940,13 @@ void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_l
emit_operand(dst, src, 0);
}

void Assembler::vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
emit_int16((unsigned char)0xEB, (0xC0 | encode));
void Assembler::evporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
evporq(dst, k0, nds, src, false, vector_len);
}

void Assembler::evporq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
evporq(dst, k0, nds, src, false, vector_len);
}

void Assembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
@@ -8004,7 +8087,8 @@ void Assembler::evpandd(XMMRegister dst, KRegister mask, XMMRegister nds, Addres
}

void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(VM_Version::supports_evex(), "requires AVX512F");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
@@ -8016,7 +8100,8 @@ void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMReg
}

void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
assert(VM_Version::supports_evex(), "requires AVX512F");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ true,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FV,/* input_size_in_bits */ EVEX_32bit);
@@ -8031,7 +8116,8 @@ void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, Addres
}

void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(VM_Version::supports_evex(), "requires AVX512F");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_embedded_opmask_register_specifier(mask);
@@ -8043,7 +8129,8 @@ void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegi
}

void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) {
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
assert(VM_Version::supports_evex(), "requires AVX512F");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ true,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FV,/* input_size_in_bits */ EVEX_32bit);
@@ -8201,8 +8288,8 @@ void Assembler::vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address
}

void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len) {
assert(VM_Version::supports_evex(), "requires EVEX support");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support");
assert(VM_Version::supports_evex(), "requires AVX512F");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src3->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
@@ -8211,6 +8298,20 @@ void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegis
emit_int8(imm8);
}

void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len) {
assert(VM_Version::supports_evex(), "requires EVEX support");
assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support");
assert(dst != xnoreg, "sanity");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit);
vex_prefix(src3, src2->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int8(0x25);
emit_operand(dst, src3, 1);
emit_int8(imm8);
}
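
// Editorial sketch (not part of the patch): VPTERNLOGQ treats imm8 as a
// three-input truth table; at every bit position the bits of dst, src2 and
// src3 form a 3-bit index into imm8. Scalar model of one 64-bit lane:
static inline uint64_t ternlog64(uint64_t a, uint64_t b, uint64_t c, uint8_t imm8) {
  uint64_t r = 0;
  for (int i = 0; i < 64; i++) {
    unsigned idx = (((a >> i) & 1) << 2) | (((b >> i) & 1) << 1) | ((c >> i) & 1);
    r |= (uint64_t)((imm8 >> idx) & 1) << i;
  }
  return r;  // e.g. imm8 = 0x96 yields a ^ b ^ c
}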

void Assembler::evexpandps(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
20 changes: 18 additions & 2 deletions src/hotspot/cpu/x86/assembler_x86.hpp
@@ -1891,6 +1891,10 @@ class Assembler : public AbstractAssembler {
void pmaddwd(XMMRegister dst, XMMRegister src);
void vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
void evpmadd52luq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
void evpmadd52luq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
void evpmadd52huq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
void evpmadd52huq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);

// Multiply add accumulate
void evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
@@ -1990,6 +1994,11 @@ class Assembler : public AbstractAssembler {
// Interleave Low Quadwords
void punpcklqdq(XMMRegister dst, XMMRegister src);

void evpunpcklqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
void evpunpcklqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);
void evpunpckhqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len);
void evpunpckhqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len);

// Vector sum of absolute difference.
void vpsadbw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);

@@ -2092,6 +2101,10 @@ class Assembler : public AbstractAssembler {
void shldl(Register dst, Register src, int8_t imm8);
void shrdl(Register dst, Register src);
void shrdl(Register dst, Register src, int8_t imm8);
#ifdef _LP64
void shldq(Register dst, Register src, int8_t imm8);
void shrdq(Register dst, Register src, int8_t imm8);
#endif

void shll(Register dst, int imm8);
void shll(Register dst);
@@ -2616,7 +2629,8 @@ class Assembler : public AbstractAssembler {
void pand(XMMRegister dst, XMMRegister src);
void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void evpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void evpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

// Andn packed integers
void pandn(XMMRegister dst, XMMRegister src);
@@ -2626,7 +2640,8 @@ class Assembler : public AbstractAssembler {
void por(XMMRegister dst, XMMRegister src);
void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void evporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void evporq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);

// Xor packed integers
void pxor(XMMRegister dst, XMMRegister src);
@@ -2640,6 +2655,7 @@ class Assembler : public AbstractAssembler {
void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len);
void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len);
void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len);
void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len);

// Vector compress/expand instructions.
void evpcompressb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
8 changes: 4 additions & 4 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@@ -5279,7 +5279,7 @@ void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMReg
// Get the reverse bit sequence of lower nibble of each byte.
vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
vpandq(dst, xtmp2, src, vec_enc);
evpandq(dst, xtmp2, src, vec_enc);
vpshufb(dst, xtmp1, dst, vec_enc);
vpsllq(dst, dst, 4, vec_enc);

@@ -5290,7 +5290,7 @@

// Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and
// right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
vporq(xtmp2, dst, xtmp2, vec_enc);
evporq(xtmp2, dst, xtmp2, vec_enc);
vector_reverse_byte(bt, dst, xtmp2, vec_enc);

} else if(vec_enc == Assembler::AVX_512bit) {
@@ -5345,11 +5345,11 @@ void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, X
void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src,
XMMRegister xtmp1, Register rtmp, int vec_enc) {
vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc);
vpandq(dst, xtmp1, src, vec_enc);
evpandq(dst, xtmp1, src, vec_enc);
vpsllq(dst, dst, nbits, vec_enc);
vpandn(xtmp1, xtmp1, src, vec_enc);
vpsrlq(xtmp1, xtmp1, nbits, vec_enc);
vporq(dst, dst, xtmp1, vec_enc);
evporq(dst, dst, xtmp1, vec_enc);
}
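
// Editorial sketch (not part of the patch): scalar model of vector_swap_nbits.
// `mask` (the broadcast bitmask) selects the low field of each adjacent pair of
// nbits-wide fields, and the two fields are exchanged; e.g. nbits = 4 with
// mask = 0x0F0F0F0F... swaps the nibbles within each byte.
static inline uint64_t swap_nbits(uint64_t src, int nbits, uint64_t mask) {
  return ((src & mask) << nbits) | ((src & ~mask) >> nbits);
}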

void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
2 changes: 2 additions & 0 deletions src/hotspot/cpu/x86/macroAssembler_x86.hpp
@@ -967,6 +967,7 @@ class MacroAssembler: public Assembler {
Register g, Register h, int iteration);

void addmq(int disp, Register r1, Register r2);

Review thread (marked as resolved):
Contributor: Leftover formatting changes.
Contributor Author: done

public:
void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
@@ -976,6 +977,7 @@
XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block,
XMMRegister shuf_mask);

#endif // _LP64

void fast_md5(Register buf, Address state, Address ofs, Address limit,
6 changes: 5 additions & 1 deletion src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -2519,7 +2519,7 @@ address StubGenerator::generate_base64_decodeBlock() {
// Decode all bytes within our merged input
__ evmovdquq(tmp, lookup_lo, Assembler::AVX_512bit);
__ evpermt2b(tmp, input_initial_valid_b64, lookup_hi, Assembler::AVX_512bit);
__ vporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit);
__ evporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit);

// Check for error. Compare (decoded | initial) to all invalid.
// If any bytes have their high-order bit set, then we have an error.
@@ -3709,6 +3709,10 @@ void StubGenerator::generate_initial() {
StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
}

if (UsePolyIntrinsics) {
StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
}
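
// Editorial sketch (not part of the patch): the stub registered above computes
// the standard Poly1305 block recurrence, acc = (acc + block + 2^128) * r
// mod (2^130 - 5). A reference model of one block update using a three-limb
// 44/44/42-bit split (the limb widths here are our assumption for
// illustration; the stub's exact schedule may differ). Assumes <cstring>,
// unsigned __int128, and a little-endian host; r-clamping and the final
// "+ s mod 2^128" step are omitted.
typedef unsigned __int128 u128;
static void poly1305_block_ref(uint64_t a[3], const uint8_t m[16], const uint64_t r[3]) {
  const uint64_t M44 = (UINT64_C(1) << 44) - 1, M42 = (UINT64_C(1) << 42) - 1;
  uint64_t lo, hi;
  memcpy(&lo, m, 8);
  memcpy(&hi, m + 8, 8);
  a[0] += lo & M44;                                // acc += block + 2^128
  a[1] += ((lo >> 44) | (hi << 20)) & M44;
  a[2] += (hi >> 24) + (UINT64_C(1) << 40);        // 2^128 = 2^88 * 2^40
  // acc *= r (mod 2^130 - 5); in radix 2^44: 2^132 == 20 and 2^176 == 20 * 2^44
  u128 d0 = (u128)a[0]*r[0] + (u128)20*a[1]*r[2] + (u128)20*a[2]*r[1];
  u128 d1 = (u128)a[0]*r[1] + (u128)a[1]*r[0] + (u128)20*a[2]*r[2];
  u128 d2 = (u128)a[0]*r[2] + (u128)a[1]*r[1] + (u128)a[2]*r[0];
  d1 += (uint64_t)(d0 >> 44);  a[0] = (uint64_t)d0 & M44;
  d2 += (uint64_t)(d1 >> 44);  a[1] = (uint64_t)d1 & M44;
  uint64_t c = (uint64_t)(d2 >> 42);  a[2] = (uint64_t)d2 & M42;
  a[0] += c * 5;                                   // fold the overflow: 2^130 == 5
  a[1] += a[0] >> 44;  a[0] &= M44;
}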

if (UseCRC32CIntrinsics) {
bool supports_clmul = VM_Version::supports_clmul();
StubRoutines::x86::generate_CRC32C_table(supports_clmul);
13 changes: 13 additions & 0 deletions src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
@@ -387,6 +387,19 @@ class StubGenerator: public StubCodeGenerator {
// Ghash single and multi block operations using AVX instructions
address generate_avx_ghash_processBlocks();

// Poly1305 multiblock using IFMA instructions
address generate_poly1305_processBlocks();
void poly1305_process_blocks_avx512(const Register input, const Register length,
const Register A0, const Register A1, const Register A2,
const Register R0, const Register R1, const Register C1);
void poly1305_multiply_scalar(const Register A0, const Register A1, const Register A2,
const Register R0, const Register R1, const Register C1, bool only128);
void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, const Register polyCP);
void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, bool only128);
void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs);
void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1,
const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, const Register polyCP);
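
// Editorial sketch (not part of the patch): poly1305_process_blocks_avx512
// follows the usual wide-vector formulation of Poly1305 -- keep 8 independent
// accumulators, step each by r^8, then fold with descending powers of r (our
// paraphrase; the stub's exact scheduling may differ). Scalar emulation of
// that schedule, with a toy 64-bit prime standing in for 2^130 - 5 and
// illustrative helper names:
static const uint64_t TOY_P = 0xffffffff00000001ULL;
static uint64_t mulmod(uint64_t a, uint64_t b) {
  return (uint64_t)((unsigned __int128)a * b % TOY_P);
}
static uint64_t poly_schedule(const uint64_t* m, size_t n /* multiple of 8, n >= 8 */, uint64_t r) {
  uint64_t r2 = mulmod(r, r), r4 = mulmod(r2, r2), r8 = mulmod(r4, r4);
  uint64_t lane[8];
  for (int j = 0; j < 8; j++) lane[j] = m[j] % TOY_P;   // first 8 blocks seed the lanes
  for (size_t i = 8; i < n; i += 8)
    for (int j = 0; j < 8; j++)                         // A_j = A_j * r^8 + M_j
      lane[j] = (uint64_t)(((unsigned __int128)mulmod(lane[j], r8) + m[i + j]) % TOY_P);
  uint64_t acc = 0, w = r;                              // fold: lane j weighted by r^(8-j)
  for (int j = 7; j >= 0; j--) {
    acc = (uint64_t)(((unsigned __int128)acc + mulmod(lane[j], w)) % TOY_P);
    w = mulmod(w, r);
  }
  return acc;
}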

// BASE64 stubs

Expand Down