From e3cfc74f5fc839c09edda3d3946ff54d611b7137 Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Thu, 29 Sep 2022 23:23:37 -0400 Subject: [PATCH 01/23] Poly1305 AVX512 intrinsic for x86_64 --- src/hotspot/cpu/x86/assembler_x86.cpp | 113 +++ src/hotspot/cpu/x86/assembler_x86.hpp | 16 + src/hotspot/cpu/x86/macroAssembler_x86.hpp | 13 + .../cpu/x86/macroAssembler_x86_poly.cpp | 874 ++++++++++++++++++ src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 89 ++ src/hotspot/cpu/x86/stubGenerator_x86_64.hpp | 3 + src/hotspot/cpu/x86/stubRoutines_x86.cpp | 1 + src/hotspot/cpu/x86/stubRoutines_x86.hpp | 4 +- src/hotspot/cpu/x86/vm_version_x86.cpp | 15 + src/hotspot/cpu/x86/vm_version_x86.hpp | 8 +- src/hotspot/share/classfile/vmIntrinsics.cpp | 3 + src/hotspot/share/classfile/vmIntrinsics.hpp | 8 +- src/hotspot/share/opto/c2compiler.cpp | 1 + src/hotspot/share/opto/escape.cpp | 1 + src/hotspot/share/opto/library_call.cpp | 35 + src/hotspot/share/opto/library_call.hpp | 1 + src/hotspot/share/opto/runtime.cpp | 21 + src/hotspot/share/opto/runtime.hpp | 1 + src/hotspot/share/runtime/globals.hpp | 3 + src/hotspot/share/runtime/stubRoutines.cpp | 1 + src/hotspot/share/runtime/stubRoutines.hpp | 2 + src/hotspot/share/runtime/vmStructs.cpp | 1 + .../com/sun/crypto/provider/Poly1305.java | 60 +- .../src/jdk/vm/ci/amd64/AMD64.java | 1 + .../unittest/Poly1305UnitTestDriver.java | 14 + .../provider/Poly1305IntrinsicFuzzTest.java | 93 ++ .../com/sun/crypto/provider/Poly1305KAT.java | 198 ++++ .../jdk/test/whitebox/CPUInfoTest.java | 3 +- 28 files changed, 1572 insertions(+), 11 deletions(-) create mode 100644 src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp create mode 100644 test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java create mode 100644 test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index 6de343808591a..0e2760f7861df 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -4973,6 +4973,40 @@ assert(vector_len == AVX_128bit? 
VM_Version::supports_avx() : emit_int16(0x04, (0xC0 | encode)); } +void Assembler::evpmadd52luq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) { + evpmadd52luq(dst, k0, src1, src2, false, vector_len); +} + +void Assembler::evpmadd52luq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { + assert(VM_Version::supports_avx512ifma(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16((unsigned char)0xB4, (0xC0 | encode)); +} + +void Assembler::evpmadd52huq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) { + evpmadd52huq(dst, k0, src1, src2, false, vector_len); +} + +void Assembler::evpmadd52huq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { + assert(VM_Version::supports_avx512ifma(), ""); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); + emit_int16((unsigned char)0xB5, (0xC0 | encode)); +} + void Assembler::evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { assert(VM_Version::supports_evex(), ""); assert(VM_Version::supports_avx512_vnni(), "must support vnni"); @@ -5390,6 +5424,40 @@ void Assembler::punpcklqdq(XMMRegister dst, XMMRegister src) { emit_int16(0x6C, (0xC0 | encode)); } +void Assembler::evpunpcklqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) { + evpunpcklqdq(dst, k0, src1, src2, false, vector_len); +} + +void Assembler::evpunpcklqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { + assert(UseAVX > 2, "requires AVX512F"); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int16(0x6C, (0xC0 | encode)); +} + +void Assembler::evpunpckhqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len) { + evpunpckhqdq(dst, k0, src1, src2, false, vector_len); +} + +void Assembler::evpunpckhqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { + assert(UseAVX > 2, "requires AVX512F"); + InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_embedded_opmask_register_specifier(mask); + if (merge) { + attributes.reset_is_clear_context(); + } + + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, 
VEX_OPCODE_0F, &attributes); + emit_int16(0x6D, (0xC0 | encode)); +} + void Assembler::push(int32_t imm32) { // in 64bits we push 64bits onto the stack but only // take a 32bit immediate @@ -5834,6 +5902,18 @@ void Assembler::shrdl(Register dst, Register src, int8_t imm8) { emit_int32(0x0F, (unsigned char)0xAC, (0xC0 | encode), imm8); } +#ifdef _LP64 +void Assembler::shldq(Register dst, Register src, int8_t imm8) { + int encode = prefixq_and_encode(src->encoding(), dst->encoding()); + emit_int32(0x0F, (unsigned char)0xA4, (0xC0 | encode), imm8); +} + +void Assembler::shrdq(Register dst, Register src, int8_t imm8) { + int encode = prefixq_and_encode(src->encoding(), dst->encoding()); + emit_int32(0x0F, (unsigned char)0xAC, (0xC0 | encode), imm8); +} +#endif + // copies a single word from [esi] to [edi] void Assembler::smovl() { emit_int8((unsigned char)0xA5); @@ -7712,6 +7792,16 @@ void Assembler::vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int ve emit_int16((unsigned char)0xDB, (0xC0 | encode)); } +void Assembler::vpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(VM_Version::supports_evex(), ""); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit); + vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0xDB); + emit_operand(dst, src, 0); +} + //Variable Shift packed integers logically left. void Assembler::vpsllvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) { assert(UseAVX > 1, "requires AVX2"); @@ -7829,6 +7919,15 @@ void Assembler::vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vec emit_int16((unsigned char)0xEB, (0xC0 | encode)); } +void Assembler::vporq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + assert(VM_Version::supports_evex(), ""); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); + attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit); + vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); + emit_int8((unsigned char)0xEB); + emit_operand(dst, src, 0); +} void Assembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); @@ -8176,6 +8275,20 @@ void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegis emit_int8(imm8); } +void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len) { + assert(VM_Version::supports_evex(), "requires EVEX support"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support"); + assert(dst != xnoreg, "sanity"); + InstructionMark im(this); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + attributes.set_is_evex_instruction(); + attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit); + vex_prefix(src3, src2->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); + emit_int8(0x25); + emit_operand(dst, src3, 1); + emit_int8(imm8); +} + void 
Assembler::evexpandps(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { assert(VM_Version::supports_evex(), ""); assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), ""); diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index 0ca18b17ade40..c0afa5ef9fad7 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -1883,6 +1883,10 @@ class Assembler : public AbstractAssembler { void pmaddwd(XMMRegister dst, XMMRegister src); void vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void evpmadd52luq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void evpmadd52luq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len); + void evpmadd52huq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void evpmadd52huq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len); // Multiply add accumulate void evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); @@ -1982,6 +1986,11 @@ class Assembler : public AbstractAssembler { // Interleave Low Quadwords void punpcklqdq(XMMRegister dst, XMMRegister src); + void evpunpcklqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void evpunpcklqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len); + void evpunpckhqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2, int vector_len); + void evpunpckhqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len); + // Vector sum of absolute difference. 
void vpsadbw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); @@ -2084,6 +2093,10 @@ class Assembler : public AbstractAssembler { void shldl(Register dst, Register src, int8_t imm8); void shrdl(Register dst, Register src); void shrdl(Register dst, Register src, int8_t imm8); +#ifdef _LP64 + void shldq(Register dst, Register src, int8_t imm8); + void shrdq(Register dst, Register src, int8_t imm8); +#endif void shll(Register dst, int imm8); void shll(Register dst); @@ -2609,6 +2622,7 @@ class Assembler : public AbstractAssembler { void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len); void vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Andn packed integers void pandn(XMMRegister dst, XMMRegister src); @@ -2619,6 +2633,7 @@ class Assembler : public AbstractAssembler { void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len); void vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void vporq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Xor packed integers void pxor(XMMRegister dst, XMMRegister src); @@ -2632,6 +2647,7 @@ class Assembler : public AbstractAssembler { void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len); void vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len); void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len); + void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, Address src3, int vector_len); // Vector compress/expand instructions. 
void evpcompressb(XMMRegister dst, KRegister mask, XMMRegister src, bool merge, int vector_len); diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index 70d7adb093838..13d7afe722d84 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -967,6 +967,17 @@ class MacroAssembler: public Assembler { Register g, Register h, int iteration); void addmq(int disp, Register r1, Register r2); + + void poly1305_process_blocks_avx512(const Register input, const Register length, + const Register A0, const Register A1, const Register A2, + const Register R0, const Register R1, const Register C1); + void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1, + const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, const Register polyCP); + void poly1305_multiply_scalar(const Register A0, const Register A1, const Register A2, + const Register R0, const Register R1, const Register C1, bool only128); + void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, + const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, const Register polyCP); + public: void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, @@ -976,6 +987,8 @@ class MacroAssembler: public Assembler { XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block, XMMRegister shuf_mask); + + void poly1305_process_blocks(Register input, Register length, Register accumulator, Register R); #endif // _LP64 void fast_md5(Register buf, Address state, Address ofs, Address limit, diff --git a/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp b/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp new file mode 100644 index 0000000000000..ba191a5c27d8f --- /dev/null +++ b/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp @@ -0,0 +1,874 @@ +/* +* Copyright (c) 2022, Intel Corporation. All rights reserved. +* +* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +* +* This code is free software; you can redistribute it and/or modify it +* under the terms of the GNU General Public License version 2 only, as +* published by the Free Software Foundation. +* +* This code is distributed in the hope that it will be useful, but WITHOUT +* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +* version 2 for more details (a copy is included in the LICENSE file that +* accompanied this code). +* +* You should have received a copy of the GNU General Public License version +* 2 along with this work; if not, write to the Free Software Foundation, +* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +* +* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +* or visit www.oracle.com if you need additional information or have any +* questions. +* +*/ + +#include "precompiled.hpp" +#include "asm/assembler.hpp" +#include "asm/assembler.inline.hpp" +#include "runtime/stubRoutines.hpp" +#include "macroAssembler_x86.hpp" + +#ifdef _LP64 +// References: +// - (Normative) RFC7539 - ChaCha20 and Poly1305 for IETF Protocols +// - M. Goll and S. 
Gueron, "Vectorization of Poly1305 Message Authentication Code" +// - "The design of Poly1305" https://loup-vaillant.fr/tutorials/poly1305-design + +// Explanation for the 'well known' modular arithmetic optimization, reduction by pseudo-Mersene prime 2^130-5: +// +// Reduction by 2^130-5 can be expressed as follows: +// ( a×2^130 + b ) mod 2^130-5 //i.e. number split along the 130-bit boundary +// = ( a×2^130 - 5×a + 5×a + b ) mod 2^130-5 +// = ( a×(2^130 - 5) + 5×a + b ) mod 2^130-5 // i.e. adding multiples of modulus is a noop +// = ( 5×a + b ) mod 2^130-5 +// QED: shows mathematically the well known algorithm of 'split the number down the middle, multiply upper and add' +// This is particularly useful to understand when combining with 'odd-sized' limbs that might cause misallignment +// + +// Pseudocode for this file (in general): +// * used for poly1305_multiply_scalar +// × used for poly1305_multiply8_avx512 +// lower-case variables are scalar numbers in 3×44-bit limbs (in gprs) +// upper-case variables are 8-element vector numbers in 3×44-bit limbs (in zmm registers) +// [ ] used to denote vector numbers (with their elements) + +// Register Map: +// GPRs: +// input = rdi +// length = rbx +// accumulator = rcx +// R = r8 +// a0 = rsi +// a1 = r9 +// a2 = r10 +// r0 = r11 +// r1 = r12 +// c1 = r8; +// t1 = r13 +// t2 = r14 +// t3 = r15 +// t0 = r14 +// polyCP = r13 +// stack(rsp, rbp) +// imul(rax, rdx) +// ZMMs: +// T: xmm0-6 +// C: xmm7-9 +// A: xmm13-18 +// B: xmm19-24 +// R: xmm25-29 + +// Constant Pool OFfsets: +enum polyCPOffset { + high_bit = 0, + mask_44 = 64, + mask_42 = 128, +}; + +// Compute product for 8 16-byte message blocks, +// i.e. For each block, compute [a2 a1 a0] = [a2 a1 a0] × [r2 r1 r0] +// +// Each block/number is represented by 3 44-bit limb digits, start with multiplication +// +// a2 a1 a0 +// × r2 r1 r0 +// ---------------------------------- +// a2×r0 a1×r0 a0×r0 +// + a1×r1 a0×r1 5×a2×r1' (r1' = r1<<2) +// + a0×r2 5×a2×r2' 5×a1×r2' (r2' = r2<<2) +// ---------------------------------- +// p2 p1 p0 +// +// Then, propagate the carry (bits after bit 44) from lower limbs into higher limbs. +// Then, modular reduction from upper limb wrapped to lower limbs +// +// Math Note 1: 'carry propagation' from p2 to p0 involves multiplication by 5 (i.e. slightly modified modular reduction from above): +// ( p2×2^88 ) mod 2^130-5 +// = ( p2'×2^88 + p2''×2^130) mod 2^130-5 // Split on 130-bit boudary +// = ( p2'×2^88 + p2''×2^130 - 5×p2'' + 5×p2'') mod 2^130-5 +// = ( p2'×2^88 + p2''×(2^130 - 5) + 5×p2'') mod 2^130-5 // i.e. adding multiples of modulus is a noop +// = ( p2'×2^88 + 5×p2'') mod 2^130-5 +// +// Math Note 2: R1P = 4*5*R1 and R2P = 4*5*R2; This precomputation allows simultaneous reduction and multiplication. +// This is not the standard 'multiply-upper-by-5', here is why the factor is 4*5 instead of 5. +// For example, partial product (a2×r2): +// (a2×2^88)×(r2×2^88) mod 2^130-5 +// = (a2×r2 × 2^176) mod 2^130-5 +// = (a2×r2 × 2^46×2^130) mod 2^130-5 +// = (a2×r2×2^46 × 2^130- 5×a2×r2×2^46 + 5×a2×r2×2^46) mod 2^130-5 +// = (a2×r2×2^46 × (2^130- 5) + 5×a2×r2×2^46) mod 2^130-5 // i.e. adding multiples of modulus is a noop +// = (5×a2×r2×2^46) mod 2^130-5 +// = (a2×5×r2×2^2 × 2^44) mod 2^130-5 // Align to limb boudary +// = (a2×[5×r2×4] × 2^44) mod 2^130-5 +// = (a2×R2P × 2^44) mod 2^130-5 // i.e. 
+void MacroAssembler::poly1305_multiply8_avx512(
+  const XMMRegister A0, const XMMRegister A1, const XMMRegister A2,
+  const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, const Register polyCP)
+{
+  const XMMRegister P0_L = xmm0;
+  const XMMRegister P0_H = xmm1;
+  const XMMRegister P1_L = xmm2;
+  const XMMRegister P1_H = xmm3;
+  const XMMRegister P2_L = xmm4;
+  const XMMRegister P2_H = xmm5;
+  const XMMRegister TMP1 = xmm6;
+
+  // Reset partial sums
+  evpxorq(P0_L, P0_L, P0_L, Assembler::AVX_512bit);
+  evpxorq(P0_H, P0_H, P0_H, Assembler::AVX_512bit);
+  evpxorq(P1_L, P1_L, P1_L, Assembler::AVX_512bit);
+  evpxorq(P1_H, P1_H, P1_H, Assembler::AVX_512bit);
+  evpxorq(P2_L, P2_L, P2_L, Assembler::AVX_512bit);
+  evpxorq(P2_H, P2_H, P2_H, Assembler::AVX_512bit);
+
+  // Calculate partial products
+  evpmadd52luq(P0_L, A2, R1P, Assembler::AVX_512bit);
+  evpmadd52huq(P0_H, A2, R1P, Assembler::AVX_512bit);
+  evpmadd52luq(P1_L, A2, R2P, Assembler::AVX_512bit);
+  evpmadd52huq(P1_H, A2, R2P, Assembler::AVX_512bit);
+  evpmadd52luq(P2_L, A2, R0, Assembler::AVX_512bit);
+  evpmadd52huq(P2_H, A2, R0, Assembler::AVX_512bit);
+
+  evpmadd52luq(P1_L, A0, R1, Assembler::AVX_512bit);
+  evpmadd52huq(P1_H, A0, R1, Assembler::AVX_512bit);
+  evpmadd52luq(P2_L, A0, R2, Assembler::AVX_512bit);
+  evpmadd52huq(P2_H, A0, R2, Assembler::AVX_512bit);
+  evpmadd52luq(P0_L, A0, R0, Assembler::AVX_512bit);
+  evpmadd52huq(P0_H, A0, R0, Assembler::AVX_512bit);
+
+  evpmadd52luq(P0_L, A1, R2P, Assembler::AVX_512bit);
+  evpmadd52huq(P0_H, A1, R2P, Assembler::AVX_512bit);
+  evpmadd52luq(P1_L, A1, R0, Assembler::AVX_512bit);
+  evpmadd52huq(P1_H, A1, R0, Assembler::AVX_512bit);
+  evpmadd52luq(P2_L, A1, R1, Assembler::AVX_512bit);
+  evpmadd52huq(P2_H, A1, R1, Assembler::AVX_512bit);
+
+  // Carry propagation:
+  // (Not quite aligned)           | More mathematically correct:
+  //         P2_L   P1_L   P0_L    |   P2_L×2^88  + P1_L×2^44 + P0_L×2^0
+  // +  P2_H   P1_H   P0_H         | + P2_H×2^140 + P1_H×2^96 + P0_H×2^52
+  // ---------------------------   | -----------------------------------------------
+  // = P2_H    A2     A1     A0    | = P2_H×2^130 + A2×2^88 + A1×2^44 + A0×2^0
+  //
+  vpsrlq(TMP1, P0_L, 44, Assembler::AVX_512bit);
+  vpandq(A0, P0_L, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits
+
+  vpsllq(P0_H, P0_H, 8, Assembler::AVX_512bit);
+  vpaddq(P0_H, P0_H, TMP1, Assembler::AVX_512bit);
+  vpaddq(P1_L, P1_L, P0_H, Assembler::AVX_512bit);
+  vpandq(A1, P1_L, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits
+
+  vpsrlq(TMP1, P1_L, 44, Assembler::AVX_512bit);
+  vpsllq(P1_H, P1_H, 8, Assembler::AVX_512bit);
+  vpaddq(P1_H, P1_H, TMP1, Assembler::AVX_512bit);
+  vpaddq(P2_L, P2_L, P1_H, Assembler::AVX_512bit);
+  vpandq(A2, P2_L, Address(polyCP, mask_42), Assembler::AVX_512bit); // Clear top 22 bits
+
+  vpsrlq(TMP1, P2_L, 42, Assembler::AVX_512bit);
+  vpsllq(P2_H, P2_H, 10, Assembler::AVX_512bit);
+  vpaddq(P2_H, P2_H, TMP1, Assembler::AVX_512bit);
+
+  // Reduction: p2->a0->a1
+  // Multiply by 5 the highest bits (p2 is above 130 bits)
+  vpaddq(A0, A0, P2_H, Assembler::AVX_512bit);
+  vpsllq(P2_H, P2_H, 2, Assembler::AVX_512bit);
+  vpaddq(A0, A0, P2_H, Assembler::AVX_512bit);
+  vpsrlq(TMP1, A0, 44, Assembler::AVX_512bit);
+  vpandq(A0, A0, Address(polyCP, mask_44), Assembler::AVX_512bit);
+  vpaddq(A1, A1, TMP1, Assembler::AVX_512bit);
+}
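Reviewer note: for readers unfamiliar with the IFMA instructions used above, evpmadd52luq accumulates the low 52 bits of the 104-bit product of two 52-bit lane operands, and evpmadd52huq accumulates bits 52..103. A hypothetical single-lane Java model (illustrative values) showing why the carry code shifts the high half left by 8 (= 52-44) before folding it into the next 44-bit limb:

```java
import java.math.BigInteger;

public class Ifma52LaneModel {
    public static void main(String[] args) {
        long x = 0xfffffffffffL;  // two 44-bit limb values
        long y = 0xabcdef01234L;
        BigInteger prod = BigInteger.valueOf(x).multiply(BigInteger.valueOf(y));

        // vpmadd52luq lane: low 52 bits of the product; vpmadd52huq lane: bits 52..103
        long lo = prod.mod(BigInteger.ONE.shiftLeft(52)).longValueExact();
        long hi = prod.shiftRight(52).longValueExact();

        // On a 44-bit limb grid, the high half sits 52-44 = 8 bits above the next
        // limb boundary -- hence the vpsllq(P_H, P_H, 8) before vpaddq in the code above.
        BigInteger refolded = BigInteger.valueOf(lo)
                .add(BigInteger.valueOf(hi << 8).shiftLeft(44));
        System.out.println(prod.equals(refolded)); // true
    }
}
```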
+
+// Compute the product for a single 16-byte message block
+// - Assumes that r = [r1 r0] is only 128 bits (not 130)
+// - When only128 is set, input [a2 a1 a0] is 128 bits (i.e. a2==0)
+// - Output [a2 a1 a0] is at least 130 bits (i.e. a2 is used)
+//
+// Note 1: a2 here is only two bits, so anything above is subject to reduction.
+// Note 2: The constant c1 = 5*r1/4 = r1 + (r1 >> 2) (exact, since r1's low 2 bits are clamped to zero) simplifies the multiply to fewer operations
+//
+// Flow of the code below is as follows:
+//
+//          a2        a1        a0
+//        x                r1   r0
+//   -----------------------------
+//       a2×r0     a1×r0    a0×r0
+//   +             a0×r1
+//   + 5xa2xr1   5xa1xr1
+//   -----------------------------
+//     [0|L2L] [L1H|L1L] [L0H|L0L]
+//
+//   Registers:  t3:t2     t1:a0
+//
+// Completing the multiply and adding (with carry) 3x128-bit limbs into
+// 192-bits again (3x64-bits):
+// a0 = L0L
+// a1 = L0H + L1L
+// t3 = L1H + L2L
+void MacroAssembler::poly1305_multiply_scalar(
+  const Register a0, const Register a1, const Register a2,
+  const Register r0, const Register r1, const Register c1, bool only128)
+{
+  const Register t1 = r13;
+  const Register t2 = r14;
+  const Register t3 = r15;
+  // Note: the mulq instruction requires/clobbers rax, rdx
+
+  // t3:t2 = (a0 * r1)
+  movq(rax, r1);
+  mulq(a0);
+  movq(t2, rax);
+  movq(t3, rdx);
+
+  // t1:a0 = (a0 * r0)
+  movq(rax, r0);
+  mulq(a0);
+  movq(a0, rax); // a0 not used in other operations
+  movq(t1, rdx);
+
+  // t3:t2 += (a1 * r0)
+  movq(rax, r0);
+  mulq(a1);
+  addq(t2, rax);
+  adcq(t3, rdx);
+
+  // t1:a0 += (a1 * r1x5)
+  movq(rax, c1);
+  mulq(a1);
+  addq(a0, rax);
+  adcq(t1, rdx);
+
+  // Note: a2 is clamped to 2-bits,
+  //       r1/r0 is clamped to 60-bits,
+  //       their product is less than 2^64.
+
+  if (only128) { // Accumulator only 128 bits, i.e. a2 == 0
+    // just move and add t1-t2 to a1
+    movq(a1, t1);
+    addq(a1, t2);
+    adcq(t3, 0);
+  } else {
+    // t3:t2 += (a2 * r1x5)
+    movq(a1, a2); // use a1 for a2
+    imulq(a1, c1);
+    addq(t2, a1);
+    adcq(t3, 0);
+
+    movq(a1, t1); // t1:a0 => a1:a0
+
+    // t3:a1 += (a2 * r0):t2
+    imulq(a2, r0);
+    addq(a1, t2);
+    adcq(t3, a2);
+  }
+
+  // At this point, 3 64-bit limbs are in t3:a1:a0
+  // t3 can span over more than 2 bits, so a final partial reduction step is needed.
+  //
+  // Partial reduction (just to fit into 130 bits)
+  //    a2 = t3 & 3
+  //    k = (t3 & ~3) + (t3 >> 2)
+  //         Y    x4  +  Y    x1
+  //    a2:a1:a0 += k
+  //
+  // Result will be in a2:a1:a0
+  movq(t1, t3);
+  movl(a2, t3); // DWORD
+  andq(t1, ~3);
+  shrq(t3, 2);
+  addq(t1, t3);
+  andl(a2, 3); // DWORD
+
+  // a2:a1:a0 += k (kept in t1)
+  addq(a0, t1);
+  adcq(a1, 0);
+  adcl(a2, 0); // DWORD
+}
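Reviewer note: the partial-reduction comment above ("a2 = t3 & 3, k = (t3 & ~3) + (t3 >> 2)") is just the 2^130 ≡ 5 wrap applied at bit 130 of the t3:a1:a0 value. A small sketch with illustrative values (t3 kept small, as the code above assumes when it adds k without a carry chain):

```java
import java.math.BigInteger;

public class Poly1305PartialReduction {
    static final BigInteger P = BigInteger.ONE.shiftLeft(130).subtract(BigInteger.valueOf(5));

    public static void main(String[] args) {
        long a0 = 0x0123456789abcdefL, a1 = 0x7edcba9876543210L, t3 = 0xbeefcafe07L;
        BigInteger full = u64(t3).shiftLeft(128).add(u64(a1).shiftLeft(64)).add(u64(a0));

        long a2 = t3 & 3;                  // the two bits that stay at positions 128..129
        long k  = (t3 & ~3L) + (t3 >>> 2); // 5 * (t3 >> 2), i.e. "Y x4 + Y x1"
        BigInteger reduced = u64(a2).shiftLeft(128).add(u64(a1).shiftLeft(64))
                                    .add(u64(a0)).add(u64(k));
        System.out.println(full.mod(P).equals(reduced.mod(P))); // true
    }

    static BigInteger u64(long v) { return new BigInteger(Long.toUnsignedString(v)); }
}
```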
+
+// Convert an array of 128-bit numbers in quadwords (in D0:D1) into 128-bit numbers across 44-bit limbs (in L0:L1:L2)
+// Optionally pad all the numbers (i.e. add 2^128)
+//
+//        +-------------------------+-------------------------+
+//  D0:D1 | h0 h1 g0 g1 f0 f1 e0 e1 | d0 d1 c0 c1 b0 b1 a0 a1 |
+//        +-------------------------+-------------------------+
+//        +-------------------------+
+//  L2    | h2 d2 g2 c2 f2 b2 e2 a2 |
+//        +-------------------------+
+//        +-------------------------+
+//  L1    | h1 d1 g1 c1 f1 b1 e1 a1 |
+//        +-------------------------+
+//        +-------------------------+
+//  L0    | h0 d0 g0 c0 f0 b0 e0 a0 |
+//        +-------------------------+
+//
+void MacroAssembler::poly1305_limbs_avx512(
+  const XMMRegister D0, const XMMRegister D1,
+  const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, const Register polyCP)
+{
+  const XMMRegister TMP1 = xmm0;
+  const XMMRegister TMP2 = xmm1;
+  // Interleave blocks of data
+  evpunpckhqdq(TMP1, D0, D1, Assembler::AVX_512bit);
+  evpunpcklqdq(L0, D0, D1, Assembler::AVX_512bit);
+
+  // Highest 42-bit limbs of new blocks
+  vpsrlq(L2, TMP1, 24, Assembler::AVX_512bit);
+  if (padMSG) {
+    vporq(L2, L2, Address(polyCP, high_bit), Assembler::AVX_512bit); // Add 2^128 to all 8 final qwords of the message
+  }
+
+  // Middle 44-bit limbs of new blocks
+  vpsrlq(L1, L0, 44, Assembler::AVX_512bit);
+  vpsllq(TMP2, TMP1, 20, Assembler::AVX_512bit);
+  vpternlogq(L1, 0xA8, TMP2, Address(polyCP, mask_44), Assembler::AVX_512bit); // (A | B) & C
+
+  // Lowest 44-bit limbs of new blocks
+  vpandq(L0, L0, Address(polyCP, mask_44), Assembler::AVX_512bit);
+}
+
+// This function consumes as many whole 16*16-byte blocks as are available in input
+// After execution, input and length will point at the remaining (unprocessed) data
+// and [a2 a1 a0] will contain the current accumulator value
+//
+// Math Note:
+// Put simply, the main loop in this function multiplies each message block by r^16; the 'math' that makes this work happens before and after the loop, as follows:
+//
+// hash = (...((m1*r + m2)*r + m3)*r ... + mn)*r
+//      = m1*r^n + m2*r^(n-1) + ... + mn_1*r^2 + mn*r   // Horner's rule
+//
+//      = m1*r^n     + m5*r^(n-4)  + m9*r^(n-8)   ...   // split into 4 groups for brevity, same applies to 16
+//      + m2*r^(n-1) + m6*r^(n-5)  + m10*r^(n-9)  ...
+//      + m3*r^(n-2) + m7*r^(n-6)  + m11*r^(n-10) ...
+//      + m4*r^(n-3) + m8*r^(n-7)  + m12*r^(n-11) ...
+//
+//      = r^4 * (m1*r^(n-4) + m5*r^(n-8) + m9 *r^(n-12) ... + mn_3) // factor out r^4..r; same applies to 16 but r^16..r factors
+//      + r^3 * (m2*r^(n-4) + m6*r^(n-8) + m10*r^(n-12) ... + mn_2)
+//      + r^2 * (m3*r^(n-4) + m7*r^(n-8) + m11*r^(n-12) ... + mn_1)
+//      + r^1 * (m4*r^(n-4) + m8*r^(n-8) + m12*r^(n-12) ... + mn_0) // Note: the last message in each group has no r multiplier inside the parentheses
+//
+//      = r^4 * (((m1*r^4 + m5)*r^4 + m9 )*r^4 ... + mn_3) // reverse Horner's rule, for each group
+//      + r^3 * (((m2*r^4 + m6)*r^4 + m10)*r^4 ... + mn_2)
+//      + r^2 * (((m3*r^4 + m7)*r^4 + m11)*r^4 ... + mn_1)
+//      + r^1 * (((m4*r^4 + m8)*r^4 + m12)*r^4 ... + mn_0)
+//
+// Also see M. Goll and S. Gueron, "Vectorization of Poly1305 Message Authentication Code"
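Reviewer note: the regrouping above can be sanity-checked with plain modular arithmetic. A sketch (assumed class name; 16 blocks with a 4-way split for brevity, exactly as in the comment) comparing straight Horner evaluation against four interleaved Horner chains in r^4:

```java
import java.math.BigInteger;
import java.util.Arrays;
import java.util.Random;

public class Poly1305HornerRegroup {
    static final BigInteger P = BigInteger.ONE.shiftLeft(130).subtract(BigInteger.valueOf(5));

    public static void main(String[] args) {
        Random rnd = new Random(42);
        BigInteger r = new BigInteger(124, rnd);
        BigInteger[] m = new BigInteger[16]; // n = 16 message blocks
        for (int i = 0; i < m.length; i++) m[i] = new BigInteger(128, rnd);

        // Straight Horner's rule: (...((m1*r + m2)*r + m3)*r ... + m16)*r
        BigInteger h = BigInteger.ZERO;
        for (BigInteger mi : m) h = h.add(mi).multiply(r).mod(P);

        // Four interleaved Horner chains in r^4, recombined with factors r^4..r^1
        BigInteger r4 = r.modPow(BigInteger.valueOf(4), P);
        BigInteger[] lane = new BigInteger[4];
        Arrays.fill(lane, BigInteger.ZERO);
        for (int i = 0; i < m.length; i += 4)
            for (int j = 0; j < 4; j++)
                lane[j] = lane[j].multiply(r4).add(m[i + j]).mod(P);

        BigInteger g = BigInteger.ZERO;
        for (int j = 0; j < 4; j++) // lane j holds m_(j+1), m_(j+5), ... and carries factor r^(4-j)
            g = g.add(lane[j].multiply(r.modPow(BigInteger.valueOf(4 - j), P))).mod(P);

        System.out.println(h.equals(g)); // true
    }
}
```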
Gueron, "Vectorization of Poly1305 Message Authentication Code" +// +// Pseudocode for this function: +// * used for poly1305_multiply_scalar +// × used for poly1305_multiply8_avx512 +// lower-case variables are scalar numbers in 3×44-bit limbs (in gprs) +// upper-case variables are 8&16-element vector numbers in 3×44-bit limbs (in zmm registers) +// +// C = a // [0 0 0 0 0 0 0 a] +// AL = limbs(input) +// AH = limbs(input+8) +// AL = AL + C +// input+=16, length-=16 +// +// a = r +// a = a*r +// r^2 = a +// a = a*r +// r^3 = a +// r = a*r +// r^4 = a +// +// T = r^4 || r^3 || r^2 || r +// B = limbs(T) // [r^4 0 r^3 0 r^2 0 r^1 0 ] +// C = B >> 1 // [ 0 r^4 0 r^3 0 r^2 0 r^1] +// R = r^4 || r^4 || .. // [r^4 r^4 r^4 r^4 r^4 r^4 r^4 r^4] +// B = B×R // [r^8 0 r^7 0 r^6 0 r^5 0 ] +// B = B | C // [r^8 r^4 r^7 r^3 r^6 r^2 r^5 r^1] +// push(B) +// R = r^8 || r^8 || .. // [r^8 r^8 r^8 r^8 r^8 r^8 r^8 r^8] +// B = B × R // [r^16 r^12 r^15 r^11 r^14 r^10 r^13 r^9] +// push(B) +// R = r^16 || r^16 || .. // [r^16 r^16 r^16 r^16 r^16 r^16 r^16 r^16] +// +// for (;length>=16; input+=16, length-=16) +// BL = limbs(input) +// BH = limbs(input+8) +// AL = AL × R +// AH = AH × R +// AL = AL + BL +// AH = AH + BH +// +// B = pop() +// R = pop() +// AL = AL × R +// AH = AH × B +// A = AL + AH // 16->8 blocks +// T = A >> 4 // 8 ->4 blocks +// A = A + T +// T = A >> 2 // 4 ->2 blocks +// A = A + T +// T = A >> 1 // 2 ->1 blocks +// A = A + T +// a = A +void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const Register length, + const Register a0, const Register a1, const Register a2, + const Register r0, const Register r1, const Register c1) +{ + Label L_process256Loop, L_process256LoopDone; + // Register Map: + // reserved: rsp, rbp, rcx + // PARAMs: rdi, rbx, rsi, r8-r12 + // poly1305_multiply_scalar clobbers: r13-r15, rax, rdx + const Register t0 = r14; + const Register t1 = r13; + const Register polyCP = r13; + + // poly1305_limbs_avx512 clobbers: xmm0, xmm1 + // poly1305_multiply8_avx512 clobbers: xmm0-xmm6 + const XMMRegister T0 = xmm2; + const XMMRegister T1 = xmm3; + const XMMRegister T2 = xmm4; + + const XMMRegister C0 = xmm7; + const XMMRegister C1 = xmm8; + const XMMRegister C2 = xmm9; + + const XMMRegister A0 = xmm13; + const XMMRegister A1 = xmm14; + const XMMRegister A2 = xmm15; + const XMMRegister A3 = xmm16; + const XMMRegister A4 = xmm17; + const XMMRegister A5 = xmm18; + + const XMMRegister B0 = xmm19; + const XMMRegister B1 = xmm20; + const XMMRegister B2 = xmm21; + const XMMRegister B3 = xmm22; + const XMMRegister B4 = xmm23; + const XMMRegister B5 = xmm24; + + const XMMRegister R0 = xmm25; + const XMMRegister R1 = xmm26; + const XMMRegister R2 = xmm27; + const XMMRegister R1P = xmm28; + const XMMRegister R2P = xmm29; + + subq(rsp, 512/8*6); // Make room to store 6 zmm registers (powers of R) + lea(polyCP, ExternalAddress(StubRoutines::x86::poly1305_mask_addr())); + + // Spread accumulator into 44-bit limbs in quadwords C0,C1,C2 + movq(t0, a0); + andq(t0, Address(polyCP, mask_44)); // First limb (Acc[43:0]) + movq(C0, t0); + + movq(t0, a1); + shrdq(a0, t0, 44); + andq(a0, Address(polyCP, mask_44)); // Second limb (Acc[77:52]) + movq(C1, a0); + + shrdq(a1, a2, 24); + andq(a1, Address(polyCP, mask_42)); // Third limb (Acc[129:88]) + movq(C2, a1); + + // To add accumulator, we must unroll first loop iteration + + // Load first block of data (128 bytes) and pad + // A0 to have bits 0-43 of all 8 blocks in 8 qwords + // A1 to have bits 87-44 of all 8 blocks in 8 qwords + 
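Reviewer note: the shrdq/andq sequence above splits the 130-bit accumulator a2:a1:a0 into 44/44/42-bit limbs. A plain-Java rendering of the same bit movement, with illustrative values (the join at the end confirms the split is lossless):

```java
public class Poly1305Limbs {
    public static void main(String[] args) {
        long a0 = 0x0123456789abcdefL, a1 = 0x00dcba9876543210L, a2 = 0x2L; // a2 <= 2 bits
        long MASK44 = (1L << 44) - 1, MASK42 = (1L << 42) - 1;

        long l0 = a0 & MASK44;                          // bits   0..43
        long l1 = ((a1 << 20) | (a0 >>> 44)) & MASK44;  // bits  44..87  (shrdq a0, a1, 44)
        long l2 = ((a2 << 40) | (a1 >>> 24)) & MASK42;  // bits 88..129  (shrdq a1, a2, 24)

        // Joining the limbs back reproduces the original 130-bit value
        long b0 = l0 | (l1 << 44);
        long b1 = (l1 >>> 20) | (l2 << 24);
        long b2 = l2 >>> 40;
        System.out.println(b0 == a0 && b1 == a1 && b2 == a2); // true
    }
}
```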
+  // Load first block of data (128 bytes) and pad
+  // A0 to have bits 0-43 of all 8 blocks in 8 qwords
+  // A1 to have bits 87-44 of all 8 blocks in 8 qwords
+  // A2 to have bits 127-88 of all 8 blocks in 8 qwords
+  evmovdquq(T0, Address(input, 0), Assembler::AVX_512bit);
+  evmovdquq(T1, Address(input, 64), Assembler::AVX_512bit);
+  poly1305_limbs_avx512(T0, T1, A0, A1, A2, true, polyCP);
+
+  // Add accumulator to the first message block
+  vpaddq(A0, A0, C0, Assembler::AVX_512bit);
+  vpaddq(A1, A1, C1, Assembler::AVX_512bit);
+  vpaddq(A2, A2, C2, Assembler::AVX_512bit);
+
+  // Load next blocks of data (128 bytes) and pad
+  // A3 to have bits 0-43 of all 8 blocks in 8 qwords
+  // A4 to have bits 87-44 of all 8 blocks in 8 qwords
+  // A5 to have bits 127-88 of all 8 blocks in 8 qwords
+  evmovdquq(T0, Address(input, 64*2), Assembler::AVX_512bit);
+  evmovdquq(T1, Address(input, 64*3), Assembler::AVX_512bit);
+  poly1305_limbs_avx512(T0, T1, A3, A4, A5, true, polyCP);
+
+  subl(length, 16*16);
+  lea(input, Address(input,16*16));
+
+  // Compute the powers of R^1..R^4 and form 44-bit limbs of each
+  // T0 to have bits 0-127 in 4 quadword pairs
+  // T1 to have bits 128-129 in alternating 8 qwords
+  vpxorq(T1, T1, T1, Assembler::AVX_512bit);
+  movq(T2, r0);
+  vpinsrq(T2, T2, r1, 1);
+  vinserti32x4(T0, T0, T2, 3);
+
+  // Calculate R^2
+  movq(a0, r0);
+  movq(a1, r1);
+  // "Clever": a2 not set because poly1305_multiply_scalar has a flag to indicate 128-bit accumulator
+  poly1305_multiply_scalar(a0, a1, a2, r0, r1, c1, true);
+
+  movq(T2, a0);
+  vpinsrq(T2, T2, a1, 1);
+  vinserti32x4(T0, T0, T2, 2);
+  movq(T2, a2);
+  vinserti32x4(T1, T1, T2, 2);
+
+  // Calculate R^3
+  poly1305_multiply_scalar(a0, a1, a2, r0, r1, c1, false);
+
+  movq(T2, a0);
+  vpinsrq(T2, T2, a1, 1);
+  vinserti32x4(T0, T0, T2, 1);
+  movq(T2, a2);
+  vinserti32x4(T1, T1, T2, 1);
+
+  // Calculate R^4
+  poly1305_multiply_scalar(a0, a1, a2, r0, r1, c1, false);
+
+  movq(T2, a0);
+  vpinsrq(T2, T2, a1, 1);
+  vinserti32x4(T0, T0, T2, 0);
+  movq(T2, a2);
+  vinserti32x4(T1, T1, T2, 0);
+
+  // Interleave the powers of R^1..R^4 to form 44-bit limbs (half-empty)
+  // B0 to have bits 0-43 of all 4 blocks in alternating 8 qwords
+  // B1 to have bits 87-44 of all 4 blocks in alternating 8 qwords
+  // B2 to have bits 127-88 of all 4 blocks in alternating 8 qwords
+  lea(polyCP, ExternalAddress(StubRoutines::x86::poly1305_mask_addr()));
+  vpxorq(T2, T2, T2, Assembler::AVX_512bit);
+  poly1305_limbs_avx512(T0, T2, B0, B1, B2, false, polyCP);
+
+  // T1 contains the 2 highest bits of the powers of R
+  vpsllq(T1, T1, 40, Assembler::AVX_512bit);
+  vporq(B2, B2, T1, Assembler::AVX_512bit);
+
+  // Broadcast 44-bit limbs of R^4 into R0,R1,R2
+  mov(t0, a0);
+  andq(t0, Address(polyCP, mask_44)); // First limb (R^4[43:0])
+  evpbroadcastq(R0, t0, Assembler::AVX_512bit);
+
+  movq(t0, a1);
+  shrdq(a0, t0, 44);
+  andq(a0, Address(polyCP, mask_44)); // Second limb (R^4[87:44])
+  evpbroadcastq(R1, a0, Assembler::AVX_512bit);
+
+  shrdq(a1, a2, 24);
+  andq(a1, Address(polyCP, mask_42)); // Third limb (R^4[129:88])
+  evpbroadcastq(R2, a1, Assembler::AVX_512bit);
+
+  // Generate 4*5*R^4 into {R2P,R1P}
+  // Used as multiplier in poly1305_multiply8_avx512 so can
+  // ignore bottom limb and carry propagation
+  vpsllq(R1P, R1, 2, Assembler::AVX_512bit);    // 4*R^4
+  vpsllq(R2P, R2, 2, Assembler::AVX_512bit);
+  vpaddq(R1P, R1P, R1, Assembler::AVX_512bit);  // 5*R^4
+  vpaddq(R2P, R2P, R2, Assembler::AVX_512bit);
+  vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);   // 4*5*R^4
+  vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);
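Reviewer note: the scalar multiplies above compute r^2..r^4, and the two vector multiplies just below square up to r^8 and r^16 (the main-loop multiplier). The same power ladder in plain modular arithmetic (a sketch; the clamped r value is only an example):

```java
import java.math.BigInteger;

public class Poly1305Powers {
    static final BigInteger P = BigInteger.ONE.shiftLeft(130).subtract(BigInteger.valueOf(5));

    public static void main(String[] args) {
        BigInteger r = new BigInteger("0ffffffc0ffffffc0ffffffc0fffffff", 16); // example clamped r
        // r^2..r^4 by repeated multiplication (the three poly1305_multiply_scalar calls)
        BigInteger r2 = r.multiply(r).mod(P);
        BigInteger r3 = r2.multiply(r).mod(P);
        BigInteger r4 = r3.multiply(r).mod(P);
        // [r^4 r^3 r^2 r^1] × broadcast r^4 -> [r^8 r^7 r^6 r^5] (first vector multiply below)
        BigInteger r8 = r4.multiply(r4).mod(P);
        // ... × broadcast r^8 -> [r^16 .. r^9]; the main loop then multiplies each lane by r^16
        BigInteger r16 = r8.multiply(r8).mod(P);
        System.out.println(r16.equals(r.modPow(BigInteger.valueOf(16), P))); // true
    }
}
```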
+
+  // Move R^4..R^1 one element over
+  vpslldq(C0, B0, 8, Assembler::AVX_512bit);
+  vpslldq(C1, B1, 8, Assembler::AVX_512bit);
+  vpslldq(C2, B2, 8, Assembler::AVX_512bit);
+
+  // Calculate R^8-R^5
+  poly1305_multiply8_avx512(B0, B1, B2,            // ACC=R^4..R^1
+                            R0, R1, R2, R1P, R2P,  // R^4..R^4, 4*5*R^4
+                            polyCP);
+
+  // Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R
+  vporq(B0, B0, C0, Assembler::AVX_512bit);
+  vporq(B1, B1, C1, Assembler::AVX_512bit);
+  vporq(B2, B2, C2, Assembler::AVX_512bit);
+
+  // Broadcast R^8
+  vpbroadcastq(R0, B0, Assembler::AVX_512bit);
+  vpbroadcastq(R1, B1, Assembler::AVX_512bit);
+  vpbroadcastq(R2, B2, Assembler::AVX_512bit);
+
+  // Generate 4*5*R^8
+  vpsllq(R1P, R1, 2, Assembler::AVX_512bit);
+  vpsllq(R2P, R2, 2, Assembler::AVX_512bit);
+  vpaddq(R1P, R1P, R1, Assembler::AVX_512bit);  // 5*R^8
+  vpaddq(R2P, R2P, R2, Assembler::AVX_512bit);
+  vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);   // 4*5*R^8
+  vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);
+
+  // Store R^8-R for later use
+  evmovdquq(Address(rsp, 64*0), B0, Assembler::AVX_512bit);
+  evmovdquq(Address(rsp, 64*1), B1, Assembler::AVX_512bit);
+  evmovdquq(Address(rsp, 64*2), B2, Assembler::AVX_512bit);
+
+  // Calculate R^16-R^9
+  poly1305_multiply8_avx512(B0, B1, B2,            // ACC=R^8..R^1
+                            R0, R1, R2, R1P, R2P,  // R^8..R^8, 4*5*R^8
+                            polyCP);
+
+  // Store R^16-R^9 for later use
+  evmovdquq(Address(rsp, 64*3), B0, Assembler::AVX_512bit);
+  evmovdquq(Address(rsp, 64*4), B1, Assembler::AVX_512bit);
+  evmovdquq(Address(rsp, 64*5), B2, Assembler::AVX_512bit);
+
+  // Broadcast R^16
+  vpbroadcastq(R0, B0, Assembler::AVX_512bit);
+  vpbroadcastq(R1, B1, Assembler::AVX_512bit);
+  vpbroadcastq(R2, B2, Assembler::AVX_512bit);
+
+  // Generate 4*5*R^16
+  vpsllq(R1P, R1, 2, Assembler::AVX_512bit);
+  vpsllq(R2P, R2, 2, Assembler::AVX_512bit);
+  vpaddq(R1P, R1P, R1, Assembler::AVX_512bit);  // 5*R^16
+  vpaddq(R2P, R2P, R2, Assembler::AVX_512bit);
+  vpsllq(R1P, R1P, 2, Assembler::AVX_512bit);   // 4*5*R^16
+  vpsllq(R2P, R2P, 2, Assembler::AVX_512bit);
+
+  // VECTOR LOOP: process 16 16-byte message blocks at a time
+  bind(L_process256Loop);
+  cmpl(length, 16*16);
+  jcc(Assembler::less, L_process256LoopDone);
+
+  // Load and interleave the next blocks of data (128 bytes)
+  evmovdquq(T0, Address(input, 0), Assembler::AVX_512bit);
+  evmovdquq(T1, Address(input, 64), Assembler::AVX_512bit);
+  poly1305_limbs_avx512(T0, T1, B0, B1, B2, true, polyCP);
+
+  // Load and interleave the next blocks of data (128 bytes)
+  evmovdquq(T0, Address(input, 64*2), Assembler::AVX_512bit);
+  evmovdquq(T1, Address(input, 64*3), Assembler::AVX_512bit);
+  poly1305_limbs_avx512(T0, T1, B3, B4, B5, true, polyCP);
+
+  poly1305_multiply8_avx512(A0, A1, A2,            // MSG/ACC 16 blocks
+                            R0, R1, R2, R1P, R2P,  // R^16..R^16, 4*5*R^16
+                            polyCP);
+  poly1305_multiply8_avx512(A3, A4, A5,            // MSG/ACC 16 blocks
+                            R0, R1, R2, R1P, R2P,  // R^16..R^16, 4*5*R^16
+                            polyCP);
+
+  vpaddq(A0, A0, B0, Assembler::AVX_512bit); // Add lowest 44-bit limbs from new blocks to accumulator
+  vpaddq(A1, A1, B1, Assembler::AVX_512bit); // Add middle 44-bit limbs from new blocks to accumulator
+  vpaddq(A2, A2, B2, Assembler::AVX_512bit); // Add highest 42-bit limbs from new blocks to accumulator
+  vpaddq(A3, A3, B3, Assembler::AVX_512bit); // Add lowest 44-bit limbs from new blocks to accumulator
+  vpaddq(A4, A4, B4, Assembler::AVX_512bit); // Add middle 44-bit limbs from new blocks to accumulator
+  vpaddq(A5, A5, B5, Assembler::AVX_512bit); // Add highest 42-bit limbs from new blocks to accumulator
+
+  subl(length, 16*16);
+  lea(input, Address(input,16*16));
+  jmp(L_process256Loop);
+
+  bind(L_process256LoopDone);
+
+  // Tail processing: Need to multiply ACC by
R^16..R^1 and add it all up into a single scalar value + // Read R^16-R^9 + evmovdquq(B0, Address(rsp, 64*3), Assembler::AVX_512bit); + evmovdquq(B1, Address(rsp, 64*4), Assembler::AVX_512bit); + evmovdquq(B2, Address(rsp, 64*5), Assembler::AVX_512bit); + // Read R^8-R + evmovdquq(R0, Address(rsp, 64*0), Assembler::AVX_512bit); + evmovdquq(R1, Address(rsp, 64*1), Assembler::AVX_512bit); + evmovdquq(R2, Address(rsp, 64*2), Assembler::AVX_512bit); + + // Generate 4*5*[R^16..R^9] (ignore lowest limb) + vpsllq(T0, B1, 2, Assembler::AVX_512bit); + vpaddq(B3, B1, T0, Assembler::AVX_512bit); // R1' (R1*5) + vpsllq(T0, B2, 2, Assembler::AVX_512bit); + vpaddq(B4, B2, T0, Assembler::AVX_512bit); // R2' (R2*5) + vpsllq(B3, B3, 2, Assembler::AVX_512bit); // 4*5*R + vpsllq(B4, B4, 2, Assembler::AVX_512bit); + + // Generate 4*5*[R^8..R^1] (ignore lowest limb) + vpsllq(T0, R1, 2, Assembler::AVX_512bit); + vpaddq(R1P, R1, T0, Assembler::AVX_512bit); // R1' (R1*5) + vpsllq(T0, R2, 2, Assembler::AVX_512bit); + vpaddq(R2P, R2, T0, Assembler::AVX_512bit); // R2' (R2*5) + vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R + vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); + + poly1305_multiply8_avx512(A0, A1, A2, // MSG/ACC 16 blocks + B0, B1, B2, B3, B4, // R^16-R^9, R1P, R2P + polyCP); + poly1305_multiply8_avx512(A3, A4, A5, // MSG/ACC 16 blocks + R0, R1, R2, R1P, R2P, // R^8-R, R1P, R2P + polyCP); + + // Add all blocks (horizontally) + // 16->8 blocks + vpaddq(A0, A0, A3, Assembler::AVX_512bit); + vpaddq(A1, A1, A4, Assembler::AVX_512bit); + vpaddq(A2, A2, A5, Assembler::AVX_512bit); + + // 8 -> 4 blocks + vextracti64x4(T0, A0, 1); + vextracti64x4(T1, A1, 1); + vextracti64x4(T2, A2, 1); + vpaddq(A0, A0, T0, Assembler::AVX_256bit); + vpaddq(A1, A1, T1, Assembler::AVX_256bit); + vpaddq(A2, A2, T2, Assembler::AVX_256bit); + + // 4 -> 2 blocks + vextracti32x4(T0, A0, 1); + vextracti32x4(T1, A1, 1); + vextracti32x4(T2, A2, 1); + vpaddq(A0, A0, T0, Assembler::AVX_128bit); + vpaddq(A1, A1, T1, Assembler::AVX_128bit); + vpaddq(A2, A2, T2, Assembler::AVX_128bit); + + // 2 -> 1 blocks + vpsrldq(T0, A0, 8, Assembler::AVX_128bit); + vpsrldq(T1, A1, 8, Assembler::AVX_128bit); + vpsrldq(T2, A2, 8, Assembler::AVX_128bit); + + // Finish folding and clear second qword + mov64(t0, 0xfd); + kmovql(k1, t0); + evpaddq(A0, k1, A0, T0, false, Assembler::AVX_512bit); + evpaddq(A1, k1, A1, T1, false, Assembler::AVX_512bit); + evpaddq(A2, k1, A2, T2, false, Assembler::AVX_512bit); + + // Carry propagation + vpsrlq(T0, A0, 44, Assembler::AVX_512bit); + vpandq(A0, A0, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits + vpaddq(A1, A1, T0, Assembler::AVX_512bit); + vpsrlq(T0, A1, 44, Assembler::AVX_512bit); + vpandq(A1, A1, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits + vpaddq(A2, A2, T0, Assembler::AVX_512bit); + vpsrlq(T0, A2, 42, Assembler::AVX_512bit); + vpandq(A2, A2, Address(polyCP, mask_42), Assembler::AVX_512bit); // Clear top 22 bits + vpsllq(T1, T0, 2, Assembler::AVX_512bit); + vpaddq(T0, T0, T1, Assembler::AVX_512bit); + vpaddq(A0, A0, T0, Assembler::AVX_512bit); + + // Put together A (accumulator) + movq(a0, A0); + + movq(t0, A1); + movq(t1, t0); + shlq(t1, 44); + orq(a0, t1); + + shrq(t0, 20); + movq(a2, A2); + movq(a1, a2); + shlq(a1, 24); + orq(a1, t0); + shrq(a2, 40); + + // Cleanup + vpxorq(xmm0, xmm0, xmm0, Assembler::AVX_512bit); + vpxorq(xmm1, xmm1, xmm1, Assembler::AVX_512bit); + vpxorq(T0, T0, T0, Assembler::AVX_512bit); + vpxorq(T1, T1, T1, Assembler::AVX_512bit); + 
vpxorq(T2, T2, T2, Assembler::AVX_512bit);
+  vpxorq(C0, C0, C0, Assembler::AVX_512bit);
+  vpxorq(C1, C1, C1, Assembler::AVX_512bit);
+  vpxorq(C2, C2, C2, Assembler::AVX_512bit);
+  vpxorq(A0, A0, A0, Assembler::AVX_512bit);
+  vpxorq(A1, A1, A1, Assembler::AVX_512bit);
+  vpxorq(A2, A2, A2, Assembler::AVX_512bit);
+  vpxorq(A3, A3, A3, Assembler::AVX_512bit);
+  vpxorq(A4, A4, A4, Assembler::AVX_512bit);
+  vpxorq(A5, A5, A5, Assembler::AVX_512bit);
+  vpxorq(B0, B0, B0, Assembler::AVX_512bit);
+  vpxorq(B1, B1, B1, Assembler::AVX_512bit);
+  vpxorq(B2, B2, B2, Assembler::AVX_512bit);
+  vpxorq(B3, B3, B3, Assembler::AVX_512bit);
+  vpxorq(B4, B4, B4, Assembler::AVX_512bit);
+  vpxorq(B5, B5, B5, Assembler::AVX_512bit);
+  vpxorq(R0, R0, R0, Assembler::AVX_512bit);
+  vpxorq(R1, R1, R1, Assembler::AVX_512bit);
+  vpxorq(R2, R2, R2, Assembler::AVX_512bit);
+  vpxorq(R1P, R1P, R1P, Assembler::AVX_512bit);
+  vpxorq(R2P, R2P, R2P, Assembler::AVX_512bit);
+  // Scrub the saved powers of R from the stack with the zeroed A0
+  evmovdquq(Address(rsp, 64*3), A0, Assembler::AVX_512bit);
+  evmovdquq(Address(rsp, 64*4), A0, Assembler::AVX_512bit);
+  evmovdquq(Address(rsp, 64*5), A0, Assembler::AVX_512bit);
+  evmovdquq(Address(rsp, 64*0), A0, Assembler::AVX_512bit);
+  evmovdquq(Address(rsp, 64*1), A0, Assembler::AVX_512bit);
+  evmovdquq(Address(rsp, 64*2), A0, Assembler::AVX_512bit);
+  addq(rsp, 512/8*6); // (powers of R)
+}
+
+// This function consumes as many whole 16-byte blocks as are available in input
+// After execution, input and length will point at the remaining (unprocessed) data
+// and accumulator will point to the current accumulator value
+//
+void MacroAssembler::poly1305_process_blocks(Register input, Register length, Register accumulator, Register R)
+{
+  // Register Map:
+  // input        = rdi;
+  // length       = rbx;
+  // accumulator  = rcx;
+  // R            = r8;
+
+  const Register a0 = rsi;  // [in/out] accumulator bits 63..0
+  const Register a1 = r9;   // [in/out] accumulator bits 127..64
+  const Register a2 = r10;  // [in/out] accumulator bits 191..128
+  const Register r0 = r11;  // R constant bits 63..0
+  const Register r1 = r12;  // R constant bits 127..64
+  const Register c1 = r8;   // 5*R (upper limb only)
+
+  Label L_process16Loop, L_process16LoopDone;
+
+  // Load R
+  movq(r0, Address(R, 0));
+  movq(r1, Address(R, 8));
+
+  // Compute 5*R (Upper limb only)
+  movq(c1, r1);
+  shrq(c1, 2);
+  addq(c1, r1); // c1 = r1 + (r1 >> 2)
+
+  // Load accumulator
+  movq(a0, Address(accumulator, 0));
+  movq(a1, Address(accumulator, 8));
+  movzbq(a2, Address(accumulator, 16));
+
+  // VECTOR LOOP: Minimum of 256 bytes to run vectorized code
+  cmpl(length, 16*16);
+  jcc(Assembler::less, L_process16Loop);
+
+  poly1305_process_blocks_avx512(input, length,
+                                 a0, a1, a2,
+                                 r0, r1, c1);
+
+  // SCALAR LOOP: process one 16-byte message block at a time
+  bind(L_process16Loop);
+  cmpl(length, 16);
+  jcc(Assembler::less, L_process16LoopDone);
+
+  addq(a0, Address(input,0));
+  adcq(a1, Address(input,8));
+  adcq(a2, 1);
+  poly1305_multiply_scalar(a0, a1, a2, r0, r1, c1, false);
+
+  subl(length, 16);
+  lea(input, Address(input,16));
+  jmp(L_process16Loop);
+  bind(L_process16LoopDone);
+
+  // Write output
+  movq(Address(accumulator, 0), a0);
+  movq(Address(accumulator, 8), a1);
+  movb(Address(accumulator, 16), a2);
+}
+
+#endif // _LP64
diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
index 49bd30d12d8db..6a2256edbe0e7 100644
--- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
+++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -1955,6 +1955,90 @@ address
StubGenerator::generate_base64_encodeBlock() return start; } +address StubGenerator::generate_poly1305_masksCP() { + StubCodeMark mark(this, "StubRoutines", "generate_poly1305_masksCP"); + address start = __ pc(); + // OFFSET 0: high_bit + __ emit_data64(0x0000010000000000, relocInfo::none); + __ emit_data64(0x0000010000000000, relocInfo::none); + __ emit_data64(0x0000010000000000, relocInfo::none); + __ emit_data64(0x0000010000000000, relocInfo::none); + __ emit_data64(0x0000010000000000, relocInfo::none); + __ emit_data64(0x0000010000000000, relocInfo::none); + __ emit_data64(0x0000010000000000, relocInfo::none); + __ emit_data64(0x0000010000000000, relocInfo::none); + + // OFFSET 64: mask_44 + __ emit_data64(0xfffffffffff, relocInfo::none); + __ emit_data64(0xfffffffffff, relocInfo::none); + __ emit_data64(0xfffffffffff, relocInfo::none); + __ emit_data64(0xfffffffffff, relocInfo::none); + __ emit_data64(0xfffffffffff, relocInfo::none); + __ emit_data64(0xfffffffffff, relocInfo::none); + __ emit_data64(0xfffffffffff, relocInfo::none); + __ emit_data64(0xfffffffffff, relocInfo::none); + + // OFFSET 128: mask_42 + __ emit_data64(0x3ffffffffff, relocInfo::none); + __ emit_data64(0x3ffffffffff, relocInfo::none); + __ emit_data64(0x3ffffffffff, relocInfo::none); + __ emit_data64(0x3ffffffffff, relocInfo::none); + __ emit_data64(0x3ffffffffff, relocInfo::none); + __ emit_data64(0x3ffffffffff, relocInfo::none); + __ emit_data64(0x3ffffffffff, relocInfo::none); + __ emit_data64(0x3ffffffffff, relocInfo::none); + + return start; +} + +address StubGenerator::generate_poly1305_processBlocks() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); + address start = __ pc(); + __ enter(); + + // Save all 'SOE' registers + __ push(rbx); + #ifdef _WIN64 + __ push(rsi); + __ push(rdi); + #endif + __ push(r12); + __ push(r13); + __ push(r14); + __ push(r15); + + // Normalize input + // JAVA: void processBlocks(byte[] input, int offset, int len, byte[] a, byte[] r) + const Register input = rdi; //input+offset + const Register length = rbx; + const Register accumulator = rcx; + const Register R = r8; + + __ lea(input, Address(c_rarg0, c_rarg1)); + __ mov(length, c_rarg2); + #ifdef _WIN64 // R and acc already in correct position for linux + __ mov(accumulator, r9); // arg#3 - acc + __ movptr(R, Address(rbp, 6 * wordSize)); // arg#4 - R + #endif + + __ poly1305_process_blocks(input, length, accumulator, R); + + __ pop(r15); + __ pop(r14); + __ pop(r13); + __ pop(r12); + #ifdef _WIN64 + __ pop(rdi); + __ pop(rsi); + #endif + __ pop(rbx); + + __ leave(); + __ ret(0); + return start; +} + // base64 AVX512vbmi tables address StubGenerator::base64_vbmi_lookup_lo_addr() { __ align64(); @@ -3665,6 +3749,11 @@ void StubGenerator::generate_initial() { StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); } + if (UsePolyIntrinsics) { + StubRoutines::x86::_poly1305_mask_addr = generate_poly1305_masksCP(); + StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); + } + if (UseCRC32CIntrinsics) { bool supports_clmul = VM_Version::supports_clmul(); StubRoutines::x86::generate_CRC32C_table(supports_clmul); diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp index 7d5e25de381a9..d9600aa0f1bbf 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp @@ -387,6 +387,9 @@ class StubGenerator: public StubCodeGenerator { // Ghash single and multi 
block operations using AVX instructions address generate_avx_ghash_processBlocks(); + // Poly1305 multiblock using IFMA instructions + address generate_poly1305_masksCP(); + address generate_poly1305_processBlocks(); // BASE64 stubs diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.cpp b/src/hotspot/cpu/x86/stubRoutines_x86.cpp index 8f285115538e2..f1c14eda6fcf4 100644 --- a/src/hotspot/cpu/x86/stubRoutines_x86.cpp +++ b/src/hotspot/cpu/x86/stubRoutines_x86.cpp @@ -80,6 +80,7 @@ address StubRoutines::x86::_join_0_1_base64 = NULL; address StubRoutines::x86::_join_1_2_base64 = NULL; address StubRoutines::x86::_join_2_3_base64 = NULL; address StubRoutines::x86::_decoding_table_base64 = NULL; +address StubRoutines::x86::_poly1305_mask_addr = NULL; #endif address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = NULL; diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.hpp b/src/hotspot/cpu/x86/stubRoutines_x86.hpp index 989536da2a552..abf82cfc31191 100644 --- a/src/hotspot/cpu/x86/stubRoutines_x86.hpp +++ b/src/hotspot/cpu/x86/stubRoutines_x86.hpp @@ -33,7 +33,7 @@ static bool returns_to_call_stub(address return_pc) { return return_pc == _call_ enum platform_dependent_constants { code_size1 = 20000 LP64_ONLY(+10000), // simply increase if too small (assembler will crash if too small) - code_size2 = 35300 LP64_ONLY(+35000) WINDOWS_ONLY(+2048) // simply increase if too small (assembler will crash if too small) + code_size2 = 35300 LP64_ONLY(+45000) WINDOWS_ONLY(+2048) // simply increase if too small (assembler will crash if too small) }; class x86 { @@ -192,6 +192,7 @@ class x86 { static address _join_1_2_base64; static address _join_2_3_base64; static address _decoding_table_base64; + static address _poly1305_mask_addr; #endif // byte flip mask for sha256 static address _pshuffle_byte_flip_mask_addr; @@ -323,6 +324,7 @@ class x86 { static address base64_vbmi_join_1_2_addr() { return _join_1_2_base64; } static address base64_vbmi_join_2_3_addr() { return _join_2_3_base64; } static address base64_decoding_table_addr() { return _decoding_table_base64; } + static address poly1305_mask_addr() { return _poly1305_mask_addr;} #endif static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; } static void generate_CRC32C_table(bool is_pclmulqdq_supported); diff --git a/src/hotspot/cpu/x86/vm_version_x86.cpp b/src/hotspot/cpu/x86/vm_version_x86.cpp index c63f731cb63f3..278f0109a23d1 100644 --- a/src/hotspot/cpu/x86/vm_version_x86.cpp +++ b/src/hotspot/cpu/x86/vm_version_x86.cpp @@ -922,6 +922,7 @@ void VM_Version::get_processor_features() { _features &= ~CPU_AVX512_VBMI; _features &= ~CPU_AVX512_VBMI2; _features &= ~CPU_AVX512_BITALG; + _features &= ~CPU_AVX512_IFMA; } if (UseAVX < 2) @@ -1173,6 +1174,18 @@ void VM_Version::get_processor_features() { FLAG_SET_DEFAULT(UseSHA256Intrinsics, false); } +#ifdef _LP64 + if (supports_avx512ifma()) { + if (FLAG_IS_DEFAULT(UsePolyIntrinsics)) { + FLAG_SET_DEFAULT(UsePolyIntrinsics, true); + } + } else +#endif + if (UsePolyIntrinsics) { + warning("Intrinsics for Poly1305 crypto hash functions not available on this CPU."); + FLAG_SET_DEFAULT(UsePolyIntrinsics, false); + } + #ifdef _LP64 // These are only supported on 64-bit if (UseSHA && supports_avx2() && supports_bmi2()) { @@ -2894,6 +2907,8 @@ uint64_t VM_Version::feature_flags() { result |= CPU_AVX512CD; if (_cpuid_info.sef_cpuid7_ebx.bits.avx512dq != 0) result |= CPU_AVX512DQ; + if (_cpuid_info.sef_cpuid7_ebx.bits.avx512ifma != 0) + result |= CPU_AVX512_IFMA; if 
(_cpuid_info.sef_cpuid7_ebx.bits.avx512pf != 0) result |= CPU_AVX512PF; if (_cpuid_info.sef_cpuid7_ebx.bits.avx512er != 0) diff --git a/src/hotspot/cpu/x86/vm_version_x86.hpp b/src/hotspot/cpu/x86/vm_version_x86.hpp index 256c61e1c5894..2765f152816ad 100644 --- a/src/hotspot/cpu/x86/vm_version_x86.hpp +++ b/src/hotspot/cpu/x86/vm_version_x86.hpp @@ -222,7 +222,9 @@ class VM_Version : public Abstract_VM_Version { avx512dq : 1, : 1, adx : 1, - : 3, + : 1, + avx512ifma : 1, + : 1, clflushopt : 1, clwb : 1, : 1, @@ -374,7 +376,8 @@ class VM_Version : public Abstract_VM_Version { decl(RDPID, "rdpid", 49) /* RDPID instruction */ \ decl(FSRM, "fsrm", 50) /* Fast Short REP MOV */ \ decl(GFNI, "gfni", 51) /* Vector GFNI instructions */ \ - decl(AVX512_BITALG, "avx512_bitalg", 52) /* Vector sub-word popcount and bit gather instructions */ + decl(AVX512_BITALG, "avx512_bitalg", 52) /* Vector sub-word popcount and bit gather instructions */ \ + decl(AVX512_IFMA, "avx512_ifma", 53) /* Integer Vector FMA instructions*/ #define DECLARE_CPU_FEATURE_FLAG(id, name, bit) CPU_##id = (1ULL << bit), CPU_FEATURE_FLAGS(DECLARE_CPU_FEATURE_FLAG) @@ -654,6 +657,7 @@ class VM_Version : public Abstract_VM_Version { static bool supports_adx() { return (_features & CPU_ADX) != 0; } static bool supports_evex() { return (_features & CPU_AVX512F) != 0; } static bool supports_avx512dq() { return (_features & CPU_AVX512DQ) != 0; } + static bool supports_avx512ifma() { return (_features & CPU_AVX512_IFMA) != 0; } static bool supports_avx512pf() { return (_features & CPU_AVX512PF) != 0; } static bool supports_avx512er() { return (_features & CPU_AVX512ER) != 0; } static bool supports_avx512cd() { return (_features & CPU_AVX512CD) != 0; } diff --git a/src/hotspot/share/classfile/vmIntrinsics.cpp b/src/hotspot/share/classfile/vmIntrinsics.cpp index 67596bac13af6..4cd4073535974 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.cpp +++ b/src/hotspot/share/classfile/vmIntrinsics.cpp @@ -479,6 +479,9 @@ bool vmIntrinsics::disabled_by_jvm_flags(vmIntrinsics::ID id) { case vmIntrinsics::_base64_decodeBlock: if (!UseBASE64Intrinsics) return true; break; + case vmIntrinsics::_poly1305_processBlocks: + if (!UsePolyIntrinsics) return true; + break; case vmIntrinsics::_updateBytesCRC32C: case vmIntrinsics::_updateDirectByteBufferCRC32C: if (!UseCRC32CIntrinsics) return true; diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index 7a989492df6cc..cfc7e2e0a54c5 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -513,7 +513,13 @@ class methodHandle; do_class(java_util_Base64_Decoder, "java/util/Base64$Decoder") \ do_intrinsic(_base64_decodeBlock, java_util_Base64_Decoder, decodeBlock_name, decodeBlock_signature, F_R) \ do_name(decodeBlock_name, "decodeBlock") \ - do_signature(decodeBlock_signature, "([BII[BIZZ)I") \ + do_signature(decodeBlock_signature, "([BII[BIZZ)I") \ + \ + /* support for com.sun.crypto.provider.Poly1305 */ \ + do_class(com_sun_crypto_provider_Poly1305, "com/sun/crypto/provider/Poly1305") \ + do_intrinsic(_poly1305_processBlocks, com_sun_crypto_provider_Poly1305, processMultipleBlocks_name, polyBlock_signature, F_S) \ + do_name(processMultipleBlocks_name, "processMultipleBlocks") \ + do_signature(polyBlock_signature, "([BII[B[B)V") \ \ /* support for com.sun.crypto.provider.GHASH */ \ do_class(com_sun_crypto_provider_ghash, "com/sun/crypto/provider/GHASH") \ diff --git 
a/src/hotspot/share/opto/c2compiler.cpp b/src/hotspot/share/opto/c2compiler.cpp index 489edd130fbbf..94037e13c26e3 100644 --- a/src/hotspot/share/opto/c2compiler.cpp +++ b/src/hotspot/share/opto/c2compiler.cpp @@ -733,6 +733,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt case vmIntrinsics::_ghash_processBlocks: case vmIntrinsics::_base64_encodeBlock: case vmIntrinsics::_base64_decodeBlock: + case vmIntrinsics::_poly1305_processBlocks: case vmIntrinsics::_updateCRC32: case vmIntrinsics::_updateBytesCRC32: case vmIntrinsics::_updateByteBufferCRC32: diff --git a/src/hotspot/share/opto/escape.cpp b/src/hotspot/share/opto/escape.cpp index af5cae980aa9c..a088e22357a62 100644 --- a/src/hotspot/share/opto/escape.cpp +++ b/src/hotspot/share/opto/escape.cpp @@ -1152,6 +1152,7 @@ void ConnectionGraph::process_call_arguments(CallNode *call) { strcmp(call->as_CallLeaf()->_name, "electronicCodeBook_decryptAESCrypt") == 0 || strcmp(call->as_CallLeaf()->_name, "counterMode_AESCrypt") == 0 || strcmp(call->as_CallLeaf()->_name, "galoisCounterMode_AESCrypt") == 0 || + strcmp(call->as_CallLeaf()->_name, "poly1305_processBlocks") == 0 || strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 || strcmp(call->as_CallLeaf()->_name, "encodeBlock") == 0 || strcmp(call->as_CallLeaf()->_name, "decodeBlock") == 0 || diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 589608a1d85b5..3b275b0b204ad 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -610,6 +610,8 @@ bool LibraryCallKit::try_to_inline(int predicate) { return inline_base64_encodeBlock(); case vmIntrinsics::_base64_decodeBlock: return inline_base64_decodeBlock(); + case vmIntrinsics::_poly1305_processBlocks: + return inline_poly1305_processBlocks(); case vmIntrinsics::_encodeISOArray: case vmIntrinsics::_encodeByteISOArray: @@ -6961,6 +6963,39 @@ bool LibraryCallKit::inline_base64_decodeBlock() { return true; } +bool LibraryCallKit::inline_poly1305_processBlocks() { + address stubAddr; + const char *stubName; + assert(UsePolyIntrinsics, "need Poly intrinsics support"); + assert(callee()->signature()->size() == 5, "poly1305_processBlocks has 5 parameters"); + stubAddr = StubRoutines::poly1305_processBlocks(); + stubName = "poly1305_processBlocks"; + + if (!stubAddr) return false; + Node* input = argument(0); + Node* input_offset = argument(1); + Node* len = argument(2); + Node* acc = argument(3); + Node* r = argument(4); + + input = must_be_not_null(input, true); + acc = must_be_not_null(acc, true); + r = must_be_not_null(r, true); + + Node* input_start = array_element_address(input, intcon(0), T_BYTE); + assert(input_start, "input array is NULL"); + Node* acc_start = array_element_address(acc, intcon(0), T_BYTE); + assert(acc_start, "acc array is NULL"); + Node* r_start = array_element_address(r, intcon(0), T_BYTE); + assert(r_start, "r array is NULL"); + + Node* call = make_runtime_call(RC_LEAF, + OptoRuntime::poly1305_processBlocks_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + input_start, input_offset, len, acc_start, r_start); + return true; +} + //------------------------------inline_digestBase_implCompress----------------------- // // Calculate MD5 for single-block byte[] array. 
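For reference, the contract of the new poly1305_processBlocks leaf call can be modeled in a few lines of plain Java. This is a cross-checking sketch, not code from this patch; it assumes (as Poly1305.java arranges further down) that the accumulator arrives as 17 little-endian bytes and r as 16 little-endian, already-clamped bytes. The class and helper names are illustrative only:

    import java.math.BigInteger;

    final class Poly1305ReferenceSketch {
        private static final BigInteger P =
                BigInteger.ONE.shiftLeft(130).subtract(BigInteger.valueOf(5)); // 2^130 - 5

        // Consume whole 16-byte blocks: acc = ((acc + block + 2^128) * r) mod P, per block
        static void processBlocks(byte[] input, int offset, int len, byte[] acc, byte[] r) {
            BigInteger a = fromLE(acc, 0, acc.length);
            BigInteger rv = fromLE(r, 0, r.length);
            while (len >= 16) {
                BigInteger block = fromLE(input, offset, 16).setBit(128); // pad with a high 0x01 byte
                a = a.add(block).multiply(rv).mod(P);
                offset += 16;
                len -= 16;
            }
            toLE(a, acc); // write the accumulator back, little-endian
        }

        private static BigInteger fromLE(byte[] b, int off, int len) {
            byte[] be = new byte[len];
            for (int i = 0; i < len; i++) {
                be[i] = b[off + len - 1 - i]; // reverse into big-endian for BigInteger
            }
            return new BigInteger(1, be);
        }

        private static void toLE(BigInteger v, byte[] out) {
            byte[] be = v.toByteArray();
            java.util.Arrays.fill(out, (byte) 0);
            for (int i = 0; i < be.length && i < out.length; i++) {
                out[i] = be[be.length - 1 - i];
            }
        }
    }

The AVX-512 stub evaluates exactly this recurrence, but on many blocks in parallel using precomputed powers of r; see macroAssembler_x86_poly.cpp below.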
diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp index 1b99f34726938..9355db8239c15 100644 --- a/src/hotspot/share/opto/library_call.hpp +++ b/src/hotspot/share/opto/library_call.hpp @@ -293,6 +293,7 @@ class LibraryCallKit : public GraphKit { bool inline_ghash_processBlocks(); bool inline_base64_encodeBlock(); bool inline_base64_decodeBlock(); + bool inline_poly1305_processBlocks(); bool inline_digestBase_implCompress(vmIntrinsics::ID id); bool inline_digestBase_implCompressMB(int predicate); bool inline_digestBase_implCompressMB(Node* digestBaseObj, ciInstanceKlass* instklass, diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp index 68408e198c74d..2becd93527c37 100644 --- a/src/hotspot/share/opto/runtime.cpp +++ b/src/hotspot/share/opto/runtime.cpp @@ -1266,6 +1266,27 @@ const TypeFunc* OptoRuntime::base64_decodeBlock_Type() { return TypeFunc::make(domain, range); } +// Poly1305 processMultipleBlocks function +const TypeFunc* OptoRuntime::poly1305_processBlocks_Type() { + int argcnt = 5; + + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // input array + fields[argp++] = TypeInt::INT; // input offset + fields[argp++] = TypeInt::INT; // input length + fields[argp++] = TypePtr::NOTNULL; // accumulator array + fields[argp++] = TypePtr::NOTNULL; // r array + assert(argp == TypeFunc::Parms + argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms + 0] = NULL; // void + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} + //------------- Interpreter state access for on stack replacement const TypeFunc* OptoRuntime::osr_end_Type() { // create input type (domain) diff --git a/src/hotspot/share/opto/runtime.hpp b/src/hotspot/share/opto/runtime.hpp index 43e4cff52287f..1de8ffb18fb97 100644 --- a/src/hotspot/share/opto/runtime.hpp +++ b/src/hotspot/share/opto/runtime.hpp @@ -280,6 +280,7 @@ class OptoRuntime : public AllStatic { static const TypeFunc* ghash_processBlocks_Type(); static const TypeFunc* base64_encodeBlock_Type(); static const TypeFunc* base64_decodeBlock_Type(); + static const TypeFunc* poly1305_processBlocks_Type(); static const TypeFunc* updateBytesCRC32_Type(); static const TypeFunc* updateBytesCRC32C_Type(); diff --git a/src/hotspot/share/runtime/globals.hpp b/src/hotspot/share/runtime/globals.hpp index ad4d805cd107c..f39f08f77d246 100644 --- a/src/hotspot/share/runtime/globals.hpp +++ b/src/hotspot/share/runtime/globals.hpp @@ -238,6 +238,9 @@ const int ObjectAlignmentInBytes = 8; product(bool, UseBASE64Intrinsics, false, \ "Use intrinsics for java.util.Base64") \ \ + product(bool, UsePolyIntrinsics, false, \ + "Use intrinsics for sun.security.util.math.intpoly") \ + \ product(size_t, LargePageSizeInBytes, 0, \ "Maximum large page size used (0 will use the default large " \ "page size for the environment as the maximum)") \ diff --git a/src/hotspot/share/runtime/stubRoutines.cpp b/src/hotspot/share/runtime/stubRoutines.cpp index 93927ad0f89cf..9418b758387d3 100644 --- a/src/hotspot/share/runtime/stubRoutines.cpp +++ b/src/hotspot/share/runtime/stubRoutines.cpp @@ -130,6 +130,7 @@ address StubRoutines::_galoisCounterMode_AESCrypt = NULL; address StubRoutines::_ghash_processBlocks = NULL; address StubRoutines::_base64_encodeBlock = NULL; address 
StubRoutines::_base64_decodeBlock = NULL; +address StubRoutines::_poly1305_processBlocks = NULL; address StubRoutines::_md5_implCompress = NULL; address StubRoutines::_md5_implCompressMB = NULL; diff --git a/src/hotspot/share/runtime/stubRoutines.hpp b/src/hotspot/share/runtime/stubRoutines.hpp index 30f58519ea9e4..f4cec54aa7f3c 100644 --- a/src/hotspot/share/runtime/stubRoutines.hpp +++ b/src/hotspot/share/runtime/stubRoutines.hpp @@ -211,6 +211,7 @@ class StubRoutines: AllStatic { static address _ghash_processBlocks; static address _base64_encodeBlock; static address _base64_decodeBlock; + static address _poly1305_processBlocks; static address _md5_implCompress; static address _md5_implCompressMB; @@ -384,6 +385,7 @@ class StubRoutines: AllStatic { static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; } static address electronicCodeBook_encryptAESCrypt() { return _electronicCodeBook_encryptAESCrypt; } static address electronicCodeBook_decryptAESCrypt() { return _electronicCodeBook_decryptAESCrypt; } + static address poly1305_processBlocks() { return _poly1305_processBlocks; } static address counterMode_AESCrypt() { return _counterMode_AESCrypt; } static address ghash_processBlocks() { return _ghash_processBlocks; } static address base64_encodeBlock() { return _base64_encodeBlock; } diff --git a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp index f903916808253..ed904ea4d56e5 100644 --- a/src/hotspot/share/runtime/vmStructs.cpp +++ b/src/hotspot/share/runtime/vmStructs.cpp @@ -545,6 +545,7 @@ static_field(StubRoutines, _ghash_processBlocks, address) \ static_field(StubRoutines, _base64_encodeBlock, address) \ static_field(StubRoutines, _base64_decodeBlock, address) \ + static_field(StubRoutines, _poly1305_processBlocks, address) \ static_field(StubRoutines, _updateBytesCRC32, address) \ static_field(StubRoutines, _crc_table_adr, address) \ static_field(StubRoutines, _crc32c_table_addr, address) \ diff --git a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java index cd78df84bede0..4fdc169a34587 100644 --- a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java +++ b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java @@ -34,6 +34,8 @@ import sun.security.util.math.*; import sun.security.util.math.intpoly.*; +import jdk.internal.vm.annotation.IntrinsicCandidate; +import jdk.internal.vm.annotation.ForceInline; /** * This class represents the Poly1305 function defined in RFC 7539. 
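Poly1305 is package-private in com.sun.crypto.provider, so outside the whitebox unit tests added by this patch the intrinsic is normally reached through the ChaCha20-Poly1305 AEAD cipher. A minimal usage sketch follows; this is standard JDK 11+ API, nothing specific to this change:

    import java.nio.charset.StandardCharsets;
    import java.security.SecureRandom;
    import javax.crypto.Cipher;
    import javax.crypto.KeyGenerator;
    import javax.crypto.SecretKey;
    import javax.crypto.spec.IvParameterSpec;

    public class ChaCha20Poly1305Demo {
        public static void main(String[] args) throws Exception {
            SecretKey key = KeyGenerator.getInstance("ChaCha20").generateKey();
            byte[] nonce = new byte[12]; // 96-bit nonce per RFC 7539
            new SecureRandom().nextBytes(nonce);

            Cipher c = Cipher.getInstance("ChaCha20-Poly1305");
            c.init(Cipher.ENCRYPT_MODE, key, new IvParameterSpec(nonce));
            byte[] ct = c.doFinal("hello".getBytes(StandardCharsets.UTF_8));
            System.out.println(ct.length); // ciphertext plus 16-byte Poly1305 tag: 5 + 16 = 21
        }
    }

Bulk updates through this path funnel into engineUpdate below, which is where the patch splices in the multi-block intrinsic.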
@@ -165,11 +167,17 @@ void engineUpdate(byte[] input, int offset, int len) { blockOffset = 0; } } - while (len >= BLOCK_LENGTH) { - processBlock(input, offset, BLOCK_LENGTH); - offset += BLOCK_LENGTH; - len -= BLOCK_LENGTH; - } + + int blockMultipleLength = (len/BLOCK_LENGTH) * BLOCK_LENGTH; + byte[] aBytes = this.a.asByteArray(BLOCK_LENGTH+1); + byte[] rBytes = this.r.asByteArray(BLOCK_LENGTH); + + processMultipleBlocksCheck(input, offset, blockMultipleLength, aBytes, rBytes); + processMultipleBlocks(input, offset, blockMultipleLength, aBytes, rBytes); + this.a.setValue(aBytes, 0, aBytes.length, (byte) 0); + offset += blockMultipleLength; + len -= blockMultipleLength; + if (len > 0) { // and len < BLOCK_LENGTH System.arraycopy(input, offset, block, 0, len); blockOffset = len; @@ -235,12 +243,36 @@ private void processBlock(byte[] block, int offset, int length) { a.setProduct(r); // a = (a * r) % p } + // Emulate intrinsic, no access to class variables, but means extra conversions + @ForceInline + @IntrinsicCandidate + private static void processMultipleBlocks(byte[] input, int offset, int length, byte[] aBytes, byte[] rBytes) { + MutableIntegerModuloP A = ipl1305.getElement(aBytes).mutable(); + MutableIntegerModuloP R = ipl1305.getElement(rBytes).mutable(); + MutableIntegerModuloP temp = ipl1305.get0().mutable(); + while (length >= BLOCK_LENGTH) { + temp.setValue(input, offset, BLOCK_LENGTH, (byte)0x01); + A.setSum(temp); // A += (temp | 0x01) + A.setProduct(R); // A = (A * R) % p + offset += BLOCK_LENGTH; + length -= BLOCK_LENGTH; + } + + A.asByteArray(aBytes); + } + + private static void processMultipleBlocksCheck(byte[] input, int offset, int length, byte[] a, byte[] r) { + Objects.checkFromIndexSize(offset, length, input.length); + Objects.checkFromIndexSize(0, BLOCK_LENGTH+1, a.length); + Objects.checkFromIndexSize(0, BLOCK_LENGTH, r.length); + } + /** * Partition the authentication key into the R and S components, clamp * the R value, and instantiate IntegerModuloP objects to R and S's * numeric values. */ - private void setRSVals() { + private void setRSVals() { //throws InvalidKeyException { // Clamp the bytes in the "r" half of the key. 
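        // Equivalently (RFC 7539): with r read as a little-endian 128-bit integer,
        //   r &= 0x0ffffffc0ffffffc0ffffffc0fffffff
        // i.e. the top four bits of bytes 3, 7, 11, 15 and the low two bits of
        // bytes 4, 8, 12 are cleared, which is what the masks below implement.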
keyBytes[3] &= 15; keyBytes[7] &= 15; @@ -250,6 +282,22 @@ private void setRSVals() { keyBytes[8] &= (byte)252; keyBytes[12] &= (byte)252; + // byte keyIsZero = 0; + // for (int i = 0; i < RS_LENGTH; i++) { + // keyIsZero |= keyBytes[i]; + // } + // if (keyIsZero == 0) { + // throw new InvalidKeyException("R is set to zero"); + // } + + // keyIsZero = 0; + // for (int i = RS_LENGTH; i < 2*RS_LENGTH; i++) { + // keyIsZero |= keyBytes[i]; + // } + // if (keyIsZero == 0) { + // throw new InvalidKeyException("S is set to zero"); + // } + // Create IntegerModuloP elements from the r and s values r = ipl1305.getElement(keyBytes, 0, RS_LENGTH, (byte)0); s = ipl1305.getElement(keyBytes, RS_LENGTH, RS_LENGTH, (byte)0); diff --git a/src/jdk.internal.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java b/src/jdk.internal.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java index 13bd0748c238e..edef49ca8dcd1 100644 --- a/src/jdk.internal.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java +++ b/src/jdk.internal.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java @@ -226,6 +226,7 @@ public enum CPUFeature implements CPUFeatureName { FSRM, GFNI, AVX512_BITALG, + AVX512_IFMA, } private final EnumSet features; diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java index df80d7d9de81d..cee420a8e70c5 100644 --- a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java +++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java @@ -28,3 +28,17 @@ * @run main java.base/com.sun.crypto.provider.Poly1305UnitTest * @summary Unit test for com.sun.crypto.provider.Poly1305. */ + + /* + * @test + * @modules java.base/com.sun.crypto.provider + * @run main java.base/com.sun.crypto.provider.Poly1305IntrinsicFuzzTest + * @summary Unit test for com.sun.crypto.provider.Poly1305. + */ + + /* + * @test + * @modules java.base/com.sun.crypto.provider + * @run main java.base/com.sun.crypto.provider.Poly1305KAT + * @summary Unit test for com.sun.crypto.provider.Poly1305. + */ diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java new file mode 100644 index 0000000000000..686511bc891fc --- /dev/null +++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package com.sun.crypto.provider;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+
+import javax.crypto.spec.SecretKeySpec;
+
+// This test case relies on the fact that single-byte Poly1305.engineUpdate(byte) does not have an intrinsic.
+// In this way we can compare whether the intrinsic and the pure-Java implementation produce the same result.
+// This test case is NOT entirely deterministic; it uses a random seed for the pseudo-random number generator.
+// If a failure occurs, hardcode the seed to make the test case deterministic.
+public class Poly1305IntrinsicFuzzTest {
+    public static void main(String[] args) throws Exception {
+        //Note: it might be useful to increase this number during development of Poly1305 intrinsics for other platforms
+        final int repeat = 100;
+        for (int i = 0; i < repeat; i++) {
+            run();
+        }
+    }
+
+    public static void run() throws Exception {
+        java.util.Random rnd = new java.util.Random();
+        long seed = rnd.nextLong();
+        rnd.setSeed(seed);
+
+        byte[] key = new byte[32];
+        rnd.nextBytes(key);
+        int msgLen = rnd.nextInt(128, 4096); // x86_64 intrinsic requires 256 bytes minimum
+        byte[] message = new byte[msgLen];
+        rnd.nextBytes(message); // randomize the message content, not just its length
+
+        Poly1305 authenticator = new Poly1305();
+        Poly1305 authenticatorSlow = new Poly1305();
+        if (authenticator.engineGetMacLength() != 16) {
+            throw new RuntimeException("The length of Poly1305 MAC must be 16 bytes.");
+        }
+
+        authenticator.engineInit(new SecretKeySpec(key, 0, 32, "Poly1305"), null);
+        authenticatorSlow.engineInit(new SecretKeySpec(key, 0, 32, "Poly1305"), null);
+
+        if (rnd.nextBoolean()) {
+            // Prime just the buffer and/or accumulator (buffer can keep at most 16 bytes from previous engineUpdate)
+            int initDataLen = rnd.nextInt(8, 24);
+            authenticator.engineUpdate(message, 0, initDataLen);
+            slowUpdate(authenticatorSlow, message, 0, initDataLen);
+        }
+
+        if (rnd.nextBoolean()) {
+            // Multiple calls to engineUpdate
+            authenticator.engineUpdate(message, 0, message.length);
+            slowUpdate(authenticatorSlow, message, 0, message.length);
+        }
+
+        authenticator.engineUpdate(message, 0, message.length);
+        slowUpdate(authenticatorSlow, message, 0, message.length);
+
+        byte[] tag = authenticator.engineDoFinal();
+        byte[] tagSlow = authenticatorSlow.engineDoFinal();
+
+        if (!Arrays.equals(tag, tagSlow)) {
+            throw new RuntimeException("[Seed "+seed+"] Tag mismatch: " + Arrays.toString(tag) + " != " + Arrays.toString(tagSlow));
+        }
+    }
+
+    static void slowUpdate(Poly1305 authenticator, byte[] message, int offset, int len) {
+        len = Math.min(message.length, offset + len);
+        for (int i = offset; i < len; i++) {
+            authenticator.engineUpdate(message[i]);
+        }
+    }
+}
diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java
new file mode 100644
index 0000000000000..b6236682ef81e
--- /dev/null
+++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package com.sun.crypto.provider; + +import java.util.*; +import java.nio.ByteBuffer; +import java.util.Arrays; + +import javax.crypto.spec.SecretKeySpec; + +public class Poly1305KAT { + public static class TestData { + public TestData(String name, String keyStr, String inputStr, String outStr) { + HexFormat hex = HexFormat.of(); + testName = Objects.requireNonNull(name); + key = hex.parseHex(Objects.requireNonNull(keyStr)); + input = hex.parseHex(Objects.requireNonNull(inputStr)); + expOutput = hex.parseHex(Objects.requireNonNull(outStr)); + } + + public final String testName; + public final byte[] key; + public final byte[] input; + public final byte[] expOutput; + } + + public static final List testList = new LinkedList() {{ + add(new TestData("RFC 7539 A.3 Test Vector #1", + "0000000000000000000000000000000000000000000000000000000000000000", + "0000000000000000000000000000000000000000000000000000000000000000" + + "0000000000000000000000000000000000000000000000000000000000000000", + "00000000000000000000000000000000")); + add(new TestData("RFC 7539 A.3 Test Vector #2", + "0000000000000000000000000000000036e5f6b5c5e06070f0efca96227a863e", + "416e79207375626d697373696f6e20746f20746865204945544620696e74656e" + + "6465642062792074686520436f6e7472696275746f7220666f72207075626c69" + + "636174696f6e20617320616c6c206f722070617274206f6620616e2049455446" + + "20496e7465726e65742d4472616674206f722052464320616e6420616e792073" + + "746174656d656e74206d6164652077697468696e2074686520636f6e74657874" + + "206f6620616e204945544620616374697669747920697320636f6e7369646572" + + "656420616e20224945544620436f6e747269627574696f6e222e205375636820" + + "73746174656d656e747320696e636c756465206f72616c2073746174656d656e" + + "747320696e20494554462073657373696f6e732c2061732077656c6c20617320" + + "7772697474656e20616e6420656c656374726f6e696320636f6d6d756e696361" + + "74696f6e73206d61646520617420616e792074696d65206f7220706c6163652c" + + "207768696368206172652061646472657373656420746f", + "36e5f6b5c5e06070f0efca96227a863e")); + add(new TestData("RFC 7539 A.3 Test Vector #3", + "36e5f6b5c5e06070f0efca96227a863e00000000000000000000000000000000", + "416e79207375626d697373696f6e20746f20746865204945544620696e74656e" + + "6465642062792074686520436f6e7472696275746f7220666f72207075626c69" + + "636174696f6e20617320616c6c206f722070617274206f6620616e2049455446" + + "20496e7465726e65742d4472616674206f722052464320616e6420616e792073" + + "746174656d656e74206d6164652077697468696e2074686520636f6e74657874" + + 
"206f6620616e204945544620616374697669747920697320636f6e7369646572" + + "656420616e20224945544620436f6e747269627574696f6e222e205375636820" + + "73746174656d656e747320696e636c756465206f72616c2073746174656d656e" + + "747320696e20494554462073657373696f6e732c2061732077656c6c20617320" + + "7772697474656e20616e6420656c656374726f6e696320636f6d6d756e696361" + + "74696f6e73206d61646520617420616e792074696d65206f7220706c6163652c" + + "207768696368206172652061646472657373656420746f", + "f3477e7cd95417af89a6b8794c310cf0")); + add(new TestData("RFC 7539 A.3 Test Vector #4", + "1c9240a5eb55d38af333888604f6b5f0473917c1402b80099dca5cbc207075c0", + "2754776173206272696c6c69672c20616e642074686520736c6974687920746f" + + "7665730a446964206779726520616e642067696d626c6520696e207468652077" + + "6162653a0a416c6c206d696d737920776572652074686520626f726f676f7665" + + "732c0a416e6420746865206d6f6d65207261746873206f757467726162652e", + "4541669a7eaaee61e708dc7cbcc5eb62")); + add(new TestData("RFC 7539 A.3 Test Vector #5: If one uses 130-bit partial reduction, does the code handle the case where partially reducedfinal result is not fully reduced?", + "0200000000000000000000000000000000000000000000000000000000000000", + "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", + "03000000000000000000000000000000")); + add(new TestData("RFC 7539 A.3 Test Vector #6: What happens if addition of s overflows modulo 2^128?", + "02000000000000000000000000000000FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", + "02000000000000000000000000000000", + "03000000000000000000000000000000")); + add(new TestData("RFC 7539 A.3 Test Vector #7: What happens if data limb is all ones and there is carry from lower limb?", + "0100000000000000000000000000000000000000000000000000000000000000", + "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF0FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF" + + "11000000000000000000000000000000", + "05000000000000000000000000000000")); + add(new TestData("RFC 7539 A.3 Test Vector #8: What happens if final result from polynomial part is exactly 2^130-5?", + "0100000000000000000000000000000000000000000000000000000000000000", + "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFBFEFEFEFEFEFEFEFEFEFEFEFEFEFEFE" + + "01010101010101010101010101010101", + "00000000000000000000000000000000")); + add(new TestData("RFC 7539 A.3 Test Vector #9: What happens if final result from polynomial part is exactly 2^130-6?", + "0200000000000000000000000000000000000000000000000000000000000000", + "FDFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", + "FAFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF")); + add(new TestData("RFC 7539 A.3 Test Vector #10: What happens if 5*H+L-type reduction produces 131-bit intermediate result?", + "0100000000000000040000000000000000000000000000000000000000000000", + "E33594D7505E43B900000000000000003394D7505E4379CD0100000000000000" + + "0000000000000000000000000000000001000000000000000000000000000000", + "14000000000000005500000000000000")); + add(new TestData("RFC 7539 A.3 Test Vector #11: What happens if 5*H+L-type reduction produces 131-bit final result?", + "0100000000000000040000000000000000000000000000000000000000000000", + "E33594D7505E43B900000000000000003394D7505E4379CD0100000000000000" + + "00000000000000000000000000000000", + "13000000000000000000000000000000")); + }}; + + public static void main(String args[]) throws Exception { + int testsPassed = 0; + int testNumber = 0; + + for (TestData test : testList) { + System.out.println("*** Test " + ++testNumber + ": " + + test.testName); + if (runSingleTest(test)) { + testsPassed++; + } + } + System.out.println(); + + if (testsPassed != testNumber) { + throw 
new RuntimeException("One or more tests failed. " + + "Check output for details"); + } + } + + private static boolean runSingleTest(TestData testData) throws Exception { + Poly1305 authenticator = new Poly1305(); + authenticator.engineInit(new SecretKeySpec(testData.key, 0, testData.key.length, "Poly1305"), null); + authenticator.engineUpdate(testData.input, 0, testData.input.length); + byte[] tag = authenticator.engineDoFinal(); + if (!Arrays.equals(tag, testData.expOutput)) { + System.out.println("ERROR - Output Mismatch!"); + System.out.println("Expected:\n" + + dumpHexBytes(testData.expOutput, testData.expOutput.length, "\n", " ")); + System.out.println("Actual:\n" + + dumpHexBytes(tag, tag.length, "\n", " ")); + System.out.println(); + return false; + } + return true; + } + + /** + * Dump the hex bytes of a buffer into string form. + * + * @param data The array of bytes to dump to stdout. + * @param itemsPerLine The number of bytes to display per line + * if the {@code lineDelim} character is blank then all bytes + * will be printed on a single line. + * @param lineDelim The delimiter between lines + * @param itemDelim The delimiter between bytes + * + * @return The hexdump of the byte array + */ + private static String dumpHexBytes(byte[] data, int itemsPerLine, + String lineDelim, String itemDelim) { + return dumpHexBytes(ByteBuffer.wrap(data), itemsPerLine, lineDelim, + itemDelim); + } + + private static String dumpHexBytes(ByteBuffer data, int itemsPerLine, + String lineDelim, String itemDelim) { + StringBuilder sb = new StringBuilder(); + if (data != null) { + data.mark(); + int i = 0; + while (data.remaining() > 0) { + if (i % itemsPerLine == 0 && i != 0) { + sb.append(lineDelim); + } + sb.append(String.format("%02X", data.get())).append(itemDelim); + i++; + } + data.reset(); + } + + return sb.toString(); + } +} + diff --git a/test/lib-test/jdk/test/whitebox/CPUInfoTest.java b/test/lib-test/jdk/test/whitebox/CPUInfoTest.java index fb4c9506c59ba..37838901eeaa4 100644 --- a/test/lib-test/jdk/test/whitebox/CPUInfoTest.java +++ b/test/lib-test/jdk/test/whitebox/CPUInfoTest.java @@ -63,7 +63,8 @@ public class CPUInfoTest { "vzeroupper", "avx512_vpopcntdq", "avx512_vpclmulqdq", "avx512_vaes", "avx512_vnni", "clflush", "clflushopt", "clwb", "avx512_vbmi2", "avx512_vbmi", "rdtscp", "rdpid", - "hv", "fsrm", "avx512_bitalg", "gfni" + "hv", "fsrm", "avx512_bitalg", "gfni", + "avx512_ifma" ); // @formatter:on // Checkstyle: resume From 507d6bf605317d1b45856c0450cb69882062986a Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Thu, 13 Oct 2022 14:01:46 -0400 Subject: [PATCH 02/23] - Fix whitespace and copyright statements - Add benchmark --- src/hotspot/cpu/x86/assembler_x86.cpp | 2 +- src/hotspot/cpu/x86/macroAssembler_x86.hpp | 4 +- .../cpu/x86/macroAssembler_x86_poly.cpp | 110 +++++++++--------- src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 2 +- .../com/sun/crypto/provider/Poly1305.java | 30 +++-- .../unittest/Poly1305UnitTestDriver.java | 2 +- .../provider/Poly1305IntrinsicFuzzTest.java | 5 +- .../com/sun/crypto/provider/Poly1305KAT.java | 5 +- .../crypto/full/Poly1305DigestBench.java | 90 ++++++++++++++ 9 files changed, 177 insertions(+), 73 deletions(-) create mode 100644 test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index 46fba192d633c..d2a3d5ba15261 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -5020,7 +5020,7 
@@ void Assembler::evpmadd52luq(XMMRegister dst, KRegister mask, XMMRegister src1, if (merge) { attributes.reset_is_clear_context(); } - + int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); emit_int16((unsigned char)0xB4, (0xC0 | encode)); } diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index 13d7afe722d84..c1859ad74c322 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -968,14 +968,14 @@ class MacroAssembler: public Assembler { void addmq(int disp, Register r1, Register r2); - void poly1305_process_blocks_avx512(const Register input, const Register length, + void poly1305_process_blocks_avx512(const Register input, const Register length, const Register A0, const Register A1, const Register A2, const Register R0, const Register R1, const Register C1); void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1, const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, const Register polyCP); void poly1305_multiply_scalar(const Register A0, const Register A1, const Register A2, const Register R0, const Register R1, const Register C1, bool only128); - void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, + void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, const Register polyCP); public: diff --git a/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp b/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp index ba191a5c27d8f..3167489f420ca 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp @@ -1,27 +1,27 @@ /* -* Copyright (c) 2022, Intel Corporation. All rights reserved. -* -* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. -* -* This code is free software; you can redistribute it and/or modify it -* under the terms of the GNU General Public License version 2 only, as -* published by the Free Software Foundation. -* -* This code is distributed in the hope that it will be useful, but WITHOUT -* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -* version 2 for more details (a copy is included in the LICENSE file that -* accompanied this code). -* -* You should have received a copy of the GNU General Public License version -* 2 along with this work; if not, write to the Free Software Foundation, -* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. -* -* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA -* or visit www.oracle.com if you need additional information or have any -* questions. -* -*/ + * Copyright (c) 2022, Intel Corporation. All rights reserved. + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ #include "precompiled.hpp" #include "asm/assembler.hpp" @@ -39,7 +39,7 @@ // // Reduction by 2^130-5 can be expressed as follows: // ( a×2^130 + b ) mod 2^130-5 //i.e. number split along the 130-bit boundary -// = ( a×2^130 - 5×a + 5×a + b ) mod 2^130-5 +// = ( a×2^130 - 5×a + 5×a + b ) mod 2^130-5 // = ( a×(2^130 - 5) + 5×a + b ) mod 2^130-5 // i.e. adding multiples of modulus is a noop // = ( 5×a + b ) mod 2^130-5 // QED: shows mathematically the well known algorithm of 'split the number down the middle, multiply upper and add' @@ -86,7 +86,7 @@ enum polyCPOffset { mask_42 = 128, }; -// Compute product for 8 16-byte message blocks, +// Compute product for 8 16-byte message blocks, // i.e. For each block, compute [a2 a1 a0] = [a2 a1 a0] × [r2 r1 r0] // // Each block/number is represented by 3 44-bit limb digits, start with multiplication @@ -109,7 +109,7 @@ enum polyCPOffset { // = ( p2'×2^88 + p2''×2^130 - 5×p2'' + 5×p2'') mod 2^130-5 // = ( p2'×2^88 + p2''×(2^130 - 5) + 5×p2'') mod 2^130-5 // i.e. adding multiples of modulus is a noop // = ( p2'×2^88 + 5×p2'') mod 2^130-5 -// +// // Math Note 2: R1P = 4*5*R1 and R2P = 4*5*R2; This precomputation allows simultaneous reduction and multiplication. // This is not the standard 'multiply-upper-by-5', here is why the factor is 4*5 instead of 5. // For example, partial product (a2×r2): @@ -124,8 +124,8 @@ enum polyCPOffset { // = (a2×R2P × 2^44) mod 2^130-5 // i.e. 
R2P = 4*5*R2 // void MacroAssembler::poly1305_multiply8_avx512( - const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, - const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, const Register polyCP) + const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, + const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, const Register polyCP) { const XMMRegister P0_L = xmm0; const XMMRegister P0_H = xmm1; @@ -165,27 +165,27 @@ void MacroAssembler::poly1305_multiply8_avx512( evpmadd52luq(P2_L, A1, R1, Assembler::AVX_512bit); evpmadd52huq(P2_H, A1, R1, Assembler::AVX_512bit); - // Carry propagation: + // Carry propagation: // (Not quite aligned) | More mathematically correct: // P2_L P1_L P0_L | P2_L×2^88 + P1_L×2^44 + P0_L×2^0 - // + P2_H P1_H P0_H | + P2_H×2^140 + P1_H×2^96 + P0_H×2^52 + // + P2_H P1_H P0_H | + P2_H×2^140 + P1_H×2^96 + P0_H×2^52 // --------------------------- | ----------------------------------------------- - // = P2_H A2 A1 A0 | = P2_H×2^130 + A2×2^88 + A1×2^44 + A0×2^0 + // = P2_H A2 A1 A0 | = P2_H×2^130 + A2×2^88 + A1×2^44 + A0×2^0 // vpsrlq(TMP1, P0_L, 44, Assembler::AVX_512bit); vpandq(A0, P0_L, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits - + vpsllq(P0_H, P0_H, 8, Assembler::AVX_512bit); vpaddq(P0_H, P0_H, TMP1, Assembler::AVX_512bit); vpaddq(P1_L, P1_L, P0_H, Assembler::AVX_512bit); vpandq(A1, P1_L, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits - + vpsrlq(TMP1, P1_L, 44, Assembler::AVX_512bit); vpsllq(P1_H, P1_H, 8, Assembler::AVX_512bit); vpaddq(P1_H, P1_H, TMP1, Assembler::AVX_512bit); vpaddq(P2_L, P2_L, P1_H, Assembler::AVX_512bit); vpandq(A2, P2_L, Address(polyCP, mask_42), Assembler::AVX_512bit); // Clear top 22 bits - + vpsrlq(TMP1, P2_L, 42, Assembler::AVX_512bit); vpsllq(P2_H, P2_H, 10, Assembler::AVX_512bit); vpaddq(P2_H, P2_H, TMP1, Assembler::AVX_512bit); @@ -202,7 +202,7 @@ void MacroAssembler::poly1305_multiply8_avx512( // Compute product for a single 16-byte message blocks // - Assumes that r = [r1 r0] is only 128 bits (not 130) -// - When only128 is set, Input [a2 a1 a0] is 128 bits (i.e. a2==0) +// - When only128 is set, Input [a2 a1 a0] is 128 bits (i.e. a2==0) // - Output [a2 a1 a0] is at least 130 bits (i.e. a2 is used) // // Note 1: a2 here is only two bits so anything above is subject of reduction. @@ -230,8 +230,8 @@ void MacroAssembler::poly1305_multiply_scalar( const Register a0, const Register a1, const Register a2, const Register r0, const Register r1, const Register c1, bool only128) { - const Register t1 = r13; - const Register t2 = r14; + const Register t1 = r13; + const Register t2 = r14; const Register t3 = r15; // Note mulq instruction requires/clobers rax, rdx @@ -343,7 +343,7 @@ void MacroAssembler::poly1305_limbs_avx512( vpsllq(TMP2, TMP1, 20, Assembler::AVX_512bit); vpternlogq(L1, 0xA8, TMP2, Address(polyCP, mask_44), Assembler::AVX_512bit); // (A OR B AND C) - // Lowest 44-bit limbs of new blocks + // Lowest 44-bit limbs of new blocks vpandq(L0, L0, Address(polyCP, mask_44), Assembler::AVX_512bit); } @@ -405,7 +405,7 @@ void MacroAssembler::poly1305_limbs_avx512( // B = B × R // [r^16 r^12 r^15 r^11 r^14 r^10 r^13 r^9] // push(B) // R = r^16 || r^16 || .. 
// [r^16 r^16 r^16 r^16 r^16 r^16 r^16 r^16] -// +// // for (;length>=16; input+=16, length-=16) // BL = limbs(input) // BH = limbs(input+8) @@ -413,7 +413,7 @@ void MacroAssembler::poly1305_limbs_avx512( // AH = AH × R // AL = AL + BL // AH = AH + BH -// +// // B = pop() // R = pop() // AL = AL × R @@ -426,7 +426,7 @@ void MacroAssembler::poly1305_limbs_avx512( // T = A >> 1 // 2 ->1 blocks // A = A + T // a = A -void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const Register length, +void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const Register length, const Register a0, const Register a1, const Register a2, const Register r0, const Register r1, const Register c1) { @@ -486,7 +486,7 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const andq(a1, Address(polyCP, mask_42)); // Third limb (Acc[129:88]) movq(C2, a1); - // To add accumulator, we must unroll first loop iteration + // To add accumulator, we must unroll first loop iteration // Load first block of data (128 bytes) and pad // A0 to have bits 0-43 of all 8 blocks in 8 qwords @@ -510,7 +510,7 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const poly1305_limbs_avx512(T0, T1, A3, A4, A5, true, polyCP); subl(length, 16*16); - lea(input, Address(input,16*16)); + lea(input, Address(input,16*16)); // Compute the powers of R^1..R^4 and form 44-bit limbs of each // T0 to have bits 0-127 in 4 quadword pairs @@ -520,7 +520,7 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const vpinsrq(T2, T2, r1, 1); vinserti32x4(T0, T0, T2, 3); - // Calculate R^2 + // Calculate R^2 movq(a0, r0); movq(a1, r1); // "Clever": a2 not set because poly1305_multiply_scalar has a flag to indicate 128-bit accumulator @@ -595,7 +595,7 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const poly1305_multiply8_avx512(B0, B1, B2, // ACC=R^4..R^1 R0, R1, R2, R1P, R2P, // R^4..R^4, 4*5*R^4 polyCP); - + // Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R vporq(B0, B0, C0, Assembler::AVX_512bit); vporq(B1, B1, C1, Assembler::AVX_512bit); @@ -605,7 +605,7 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const vpbroadcastq(R0, B0, Assembler::AVX_512bit); vpbroadcastq(R1, B1, Assembler::AVX_512bit); vpbroadcastq(R2, B2, Assembler::AVX_512bit); - + // Generate 4*5*R^8 vpsllq(R1P, R1, 2, Assembler::AVX_512bit); vpsllq(R2P, R2, 2, Assembler::AVX_512bit); @@ -642,7 +642,7 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R^16 vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); - // VECTOR LOOP: process 16 * 16-byte message block at a time + // VECTOR LOOP: process 16 * 16-byte message block at a time bind(L_process256Loop); cmpl(length, 16*16); jcc(Assembler::less, L_process256LoopDone); @@ -672,7 +672,7 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const vpaddq(A5, A5, B5, Assembler::AVX_512bit); // Add highest bits from new blocks to accumulator subl(length, 16*16); - lea(input, Address(input,16*16)); + lea(input, Address(input,16*16)); jmp(L_process256Loop); bind(L_process256LoopDone); @@ -686,7 +686,7 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const evmovdquq(R0, Address(rsp, 64*0), Assembler::AVX_512bit); evmovdquq(R1, Address(rsp, 64*1), Assembler::AVX_512bit); evmovdquq(R2, Address(rsp, 64*2), Assembler::AVX_512bit); - + // Generate 
4*5*[R^16..R^9] (ignore lowest limb) vpsllq(T0, B1, 2, Assembler::AVX_512bit); vpaddq(B3, B1, T0, Assembler::AVX_512bit); // R1' (R1*5) @@ -710,9 +710,9 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const R0, R1, R2, R1P, R2P, // R^8-R, R1P, R2P polyCP); - // Add all blocks (horizontally) + // Add all blocks (horizontally) // 16->8 blocks - vpaddq(A0, A0, A3, Assembler::AVX_512bit); + vpaddq(A0, A0, A3, Assembler::AVX_512bit); vpaddq(A1, A1, A4, Assembler::AVX_512bit); vpaddq(A2, A2, A5, Assembler::AVX_512bit); @@ -810,7 +810,7 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const // This function consumes as many whole 16-byte blocks as available in input // After execution, input and length will point at remaining (unprocessed) data // and accumulator will point to the current accumulator value -// +// void MacroAssembler::poly1305_process_blocks(Register input, Register length, Register accumulator, Register R) { // Register Map: @@ -841,16 +841,16 @@ void MacroAssembler::poly1305_process_blocks(Register input, Register length, Re movq(a0, Address(accumulator, 0)); movq(a1, Address(accumulator, 8)); movzbq(a2, Address(accumulator, 16)); - + // VECTOR LOOP: Minimum of 256 bytes to run vectorized code cmpl(length, 16*16); jcc(Assembler::less, L_process16Loop); - poly1305_process_blocks_avx512(input, length, + poly1305_process_blocks_avx512(input, length, a0, a1, a2, r0, r1, c1); - // SCALAR LOOP: process one 16-byte message block at a time + // SCALAR LOOP: process one 16-byte message block at a time bind(L_process16Loop); cmpl(length, 16); jcc(Assembler::less, L_process16LoopDone); diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index 6a2256edbe0e7..aa3cfc14baf81 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -1996,7 +1996,7 @@ address StubGenerator::generate_poly1305_processBlocks() { StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); address start = __ pc(); __ enter(); - + // Save all 'SOE' registers __ push(rbx); #ifdef _WIN64 diff --git a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java index 4fdc169a34587..ee5f96318d823 100644 --- a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java +++ b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java @@ -168,15 +168,26 @@ void engineUpdate(byte[] input, int offset, int len) { } } - int blockMultipleLength = (len/BLOCK_LENGTH) * BLOCK_LENGTH; - byte[] aBytes = this.a.asByteArray(BLOCK_LENGTH+1); - byte[] rBytes = this.r.asByteArray(BLOCK_LENGTH); - - processMultipleBlocksCheck(input, offset, blockMultipleLength, aBytes, rBytes); - processMultipleBlocks(input, offset, blockMultipleLength, aBytes, rBytes); - this.a.setValue(aBytes, 0, aBytes.length, (byte) 0); - offset += blockMultipleLength; - len -= blockMultipleLength; + if (len >= 1024) { + // Intrinsic code; need to extract a and r into bytes + // Choice of 1024 is arbitrary, need enough data blocks to amortize conversion overhead + // and not affect platforms without intrinsic support + int blockMultipleLength = (len/BLOCK_LENGTH) * BLOCK_LENGTH; + byte[] aBytes = this.a.asByteArray(BLOCK_LENGTH+1); + byte[] rBytes = this.r.asByteArray(BLOCK_LENGTH); + + processMultipleBlocksCheck(input, offset, blockMultipleLength, aBytes, rBytes); + processMultipleBlocks(input, offset, 
blockMultipleLength, aBytes, rBytes); + this.a.setValue(aBytes, 0, aBytes.length, (byte) 0); + offset += blockMultipleLength; + len -= blockMultipleLength; + } else { + while (len >= BLOCK_LENGTH) { + processBlock(input, offset, BLOCK_LENGTH); + offset += BLOCK_LENGTH; + len -= BLOCK_LENGTH; + } + } if (len > 0) { // and len < BLOCK_LENGTH System.arraycopy(input, offset, block, 0, len); @@ -282,6 +293,7 @@ private void setRSVals() { //throws InvalidKeyException { keyBytes[8] &= (byte)252; keyBytes[12] &= (byte)252; + // This should be enabled, but Poly1305KAT would fail // byte keyIsZero = 0; // for (int i = 0; i < RS_LENGTH; i++) { // keyIsZero |= keyBytes[i]; diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java index cee420a8e70c5..210bc69b49411 100644 --- a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java +++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java index 686511bc891fc..aae840b011af6 100644 --- a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java +++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java @@ -1,5 +1,6 @@ /* - * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2022, Intel Corporation. All rights reserved. + * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -34,7 +35,7 @@ // If a failure occurs, hardcode the seed to make the test case deterministic public class Poly1305IntrinsicFuzzTest { public static void main(String[] args) throws Exception { - //Note: it might be useful to increase this number during development of Poly1305 intrinsics for other platforms + //Note: it might be useful to increase this number during development of new Poly1305 intrinsics final int repeat = 100; for (int i = 0; i < repeat; i++) { run(); diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java index b6236682ef81e..f9429ec637d44 100644 --- a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java +++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java @@ -1,5 +1,6 @@ /* - * Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2022, Intel Corporation. All rights reserved. + * * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -64,7 +65,7 @@ public TestData(String name, String keyStr, String inputStr, String outStr) { "747320696e20494554462073657373696f6e732c2061732077656c6c20617320" + "7772697474656e20616e6420656c656374726f6e696320636f6d6d756e696361" + "74696f6e73206d61646520617420616e792074696d65206f7220706c6163652c" + - "207768696368206172652061646472657373656420746f", + "207768696368206172652061646472657373656420746f", "36e5f6b5c5e06070f0efca96227a863e")); add(new TestData("RFC 7539 A.3 Test Vector #3", "36e5f6b5c5e06070f0efca96227a863e00000000000000000000000000000000", diff --git a/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java b/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java new file mode 100644 index 0000000000000..f381ed4875687 --- /dev/null +++ b/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ +package org.openjdk.bench.javax.crypto.full; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Setup; + +import java.lang.invoke.MethodHandle; +import java.lang.invoke.MethodHandles; +import java.lang.reflect.Method; +import java.lang.reflect.Constructor; +import java.security.Key; +import java.security.spec.AlgorithmParameterSpec; +import javax.crypto.spec.SecretKeySpec; + +public class Poly1305DigestBench extends CryptoBase { + public static final int SET_SIZE = 128; + + @Param({"64", "256", "1024", "" + 16*1024, "" + 1024*1024}) + int dataSize; + + private byte[][] data; + int index = 0; + private static MethodHandle polyEngineInit, polyEngineUpdate, polyEngineFinal; + private static Object polyObj; + + static { + try { + MethodHandles.Lookup lookup = MethodHandles.lookup(); + Class polyClazz = Class.forName("com.sun.crypto.provider.Poly1305"); + Constructor constructor = polyClazz.getDeclaredConstructor(); + constructor.setAccessible(true); + polyObj = constructor.newInstance(); + + Method m = polyClazz.getDeclaredMethod("engineInit", Key.class, AlgorithmParameterSpec.class); + m.setAccessible(true); + polyEngineInit = lookup.unreflect(m); + + m = polyClazz.getDeclaredMethod("engineUpdate", byte[].class, int.class, int.class); + m.setAccessible(true); + polyEngineUpdate = lookup.unreflect(m); + + m = polyClazz.getDeclaredMethod("engineDoFinal"); + m.setAccessible(true); + polyEngineFinal = lookup.unreflect(m); + } catch (Throwable ex) { + throw new RuntimeException(ex); + } + } + + @Setup + public void setup() { + setupProvider(); + data = fillRandom(new byte[SET_SIZE][dataSize]); + } + + @Benchmark + public byte[] digest() { + try { + byte[] d = data[index]; + index = (index +1) % SET_SIZE; + polyEngineInit.invoke(polyObj, new SecretKeySpec(d, 0, 32, "Poly1305"), null); + polyEngineUpdate.invoke(polyObj, d, 0, d.length); + return (byte[])polyEngineFinal.invoke(polyObj); + } catch (Throwable ex) { + throw new RuntimeException(ex); + } + } +} From 7e070d9e677985318a68b2269ca09e0ab4751c07 Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Thu, 13 Oct 2022 15:40:10 -0400 Subject: [PATCH 03/23] missed white-space fix --- .../share/classes/com/sun/crypto/provider/Poly1305.java | 2 +- .../openjdk/bench/javax/crypto/full/Poly1305DigestBench.java | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java index ee5f96318d823..536499c16d8e6 100644 --- a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java +++ b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java @@ -175,7 +175,7 @@ void engineUpdate(byte[] input, int offset, int len) { int blockMultipleLength = (len/BLOCK_LENGTH) * BLOCK_LENGTH; byte[] aBytes = this.a.asByteArray(BLOCK_LENGTH+1); byte[] rBytes = this.r.asByteArray(BLOCK_LENGTH); - + processMultipleBlocksCheck(input, offset, blockMultipleLength, aBytes, rBytes); processMultipleBlocks(input, offset, blockMultipleLength, aBytes, rBytes); this.a.setValue(aBytes, 0, aBytes.length, (byte) 0); diff --git a/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java b/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java index f381ed4875687..f674bc80c93e9 100644 --- a/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java +++ 
b/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java
@@ -1,5 +1,6 @@
 /*
- * Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2022, Intel Corporation. All rights reserved.
+ *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it

From f048f9380c56ad13914863b95f21427a494f2245 Mon Sep 17 00:00:00 2001
From: Volodymyr Paprotski
Date: Fri, 21 Oct 2022 16:16:54 -0400
Subject: [PATCH 04/23] further restrict UsePolyIntrinsics with supports_avx512vlbw

---
 src/hotspot/cpu/x86/vm_version_x86.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hotspot/cpu/x86/vm_version_x86.cpp b/src/hotspot/cpu/x86/vm_version_x86.cpp
index 4dd7a1a09fe93..06825b2f76ccc 100644
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp
@@ -1175,7 +1175,7 @@ void VM_Version::get_processor_features() {
   }

 #ifdef _LP64
-  if (supports_avx512ifma()) {
+  if (supports_avx512ifma() && supports_avx512vlbw()) {
     if (FLAG_IS_DEFAULT(UsePolyIntrinsics)) {
       FLAG_SET_DEFAULT(UsePolyIntrinsics, true);
     }

From de7e138b1e728d6a67bbfbd85117c9c17a282685 Mon Sep 17 00:00:00 2001
From: Volodymyr Paprotski
Date: Mon, 24 Oct 2022 17:58:28 -0400
Subject: [PATCH 05/23] assembler checks and test case fixes

---
 src/hotspot/cpu/x86/assembler_x86.cpp         | 56 ++++++++-----------
 src/hotspot/cpu/x86/assembler_x86.hpp         |  8 +--
 src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp |  8 +--
 .../cpu/x86/macroAssembler_x86_poly.cpp       | 26 ++++-----
 src/hotspot/cpu/x86/stubGenerator_x86_64.cpp  |  5 +-
 .../com/sun/crypto/provider/Poly1305.java     |  2 +-
 .../unittest/Poly1305UnitTestDriver.java      | 28 +++++++++-
 .../crypto/full/Poly1305DigestBench.java      |  4 ++
 8 files changed, 77 insertions(+), 60 deletions(-)

diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp
index 1ef1d20685bd7..f920f23528d1b 100644
--- a/src/hotspot/cpu/x86/assembler_x86.cpp
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp
@@ -5464,7 +5464,8 @@ void Assembler::evpunpcklqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2
 }

 void Assembler::evpunpcklqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
-  assert(UseAVX > 2, "requires AVX512F");
+  assert(VM_Version::supports_evex(), "requires AVX512F");
+  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
   InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   attributes.set_embedded_opmask_register_specifier(mask);
@@ -5481,7 +5482,8 @@ void Assembler::evpunpckhqdq(XMMRegister dst, XMMRegister src1, XMMRegister src2
 }

 void Assembler::evpunpckhqdq(XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
-  assert(UseAVX > 2, "requires AVX512F");
+  assert(VM_Version::supports_evex(), "requires AVX512F");
+  assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL");
   InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_is_evex_instruction();
   attributes.set_embedded_opmask_register_specifier(mask);
@@ -7820,21 +7822,12 @@ void Assembler::vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_
   emit_operand(dst, src, 0);
 }

-void
Assembler::vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); - int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int16((unsigned char)0xDB, (0xC0 | encode)); +void Assembler::evpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + evpandq(dst, k0, nds, src, false, vector_len); } -void Assembler::vpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); - attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit); - vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int8((unsigned char)0xDB); - emit_operand(dst, src, 0); +void Assembler::evpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + evpandq(dst, k0, nds, src, false, vector_len); } //Variable Shift packed integers logically left. @@ -7947,21 +7940,12 @@ void Assembler::vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_l emit_operand(dst, src, 0); } -void Assembler::vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); - int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int16((unsigned char)0xEB, (0xC0 | encode)); +void Assembler::evporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { + evporq(dst, k0, nds, src, false, vector_len); } -void Assembler::vporq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { - assert(VM_Version::supports_evex(), ""); - InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); - attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit); - vex_prefix(src, nds->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes); - emit_int8((unsigned char)0xEB); - emit_operand(dst, src, 0); +void Assembler::evporq(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { + evporq(dst, k0, nds, src, false, vector_len); } void Assembler::evpord(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { @@ -8103,7 +8087,8 @@ void Assembler::evpandd(XMMRegister dst, KRegister mask, XMMRegister nds, Addres } void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { - assert(VM_Version::supports_evex(), ""); + assert(VM_Version::supports_evex(), "requires AVX512F"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL"); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); attributes.set_is_evex_instruction(); attributes.set_embedded_opmask_register_specifier(mask); @@ -8115,7 +8100,8 @@ void 
Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMReg } void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { - assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), ""); + assert(VM_Version::supports_evex(), "requires AVX512F"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL"); InstructionMark im(this); InstructionAttr attributes(vector_len, /* vex_w */ true,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FV,/* input_size_in_bits */ EVEX_32bit); @@ -8130,7 +8116,8 @@ void Assembler::evpandq(XMMRegister dst, KRegister mask, XMMRegister nds, Addres } void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len) { - assert(VM_Version::supports_evex(), ""); + assert(VM_Version::supports_evex(), "requires AVX512F"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL"); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); attributes.set_is_evex_instruction(); attributes.set_embedded_opmask_register_specifier(mask); @@ -8142,7 +8129,8 @@ void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegi } void Assembler::evporq(XMMRegister dst, KRegister mask, XMMRegister nds, Address src, bool merge, int vector_len) { - assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), ""); + assert(VM_Version::supports_evex(), "requires AVX512F"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL"); InstructionMark im(this); InstructionAttr attributes(vector_len, /* vex_w */ true,/* legacy_mode */ false, /* no_mask_reg */ false,/* uses_vl */ true); attributes.set_address_attributes(/* tuple_type */ EVEX_FV,/* input_size_in_bits */ EVEX_32bit); @@ -8300,8 +8288,8 @@ void Assembler::vpternlogd(XMMRegister dst, int imm8, XMMRegister src2, Address } void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, XMMRegister src3, int vector_len) { - assert(VM_Version::supports_evex(), "requires EVEX support"); - assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support"); + assert(VM_Version::supports_evex(), "requires AVX512F"); + assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires AVX512VL"); InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_is_evex_instruction(); int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src3->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index c9a30633fe553..297810a1fb68d 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -2629,8 +2629,8 @@ class Assembler : public AbstractAssembler { void pand(XMMRegister dst, XMMRegister src); void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len); - void vpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); - void vpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void 
evpandq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void evpandq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Andn packed integers void pandn(XMMRegister dst, XMMRegister src); @@ -2640,8 +2640,8 @@ class Assembler : public AbstractAssembler { void por(XMMRegister dst, XMMRegister src); void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len); - void vporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); - void vporq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); + void evporq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); + void evporq(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // Xor packed integers void pxor(XMMRegister dst, XMMRegister src); diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp index 2154e867d929b..c3a001d4bac2a 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp @@ -5279,7 +5279,7 @@ void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMReg // Get the reverse bit sequence of lower nibble of each byte. vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg); vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc); - vpandq(dst, xtmp2, src, vec_enc); + evpandq(dst, xtmp2, src, vec_enc); vpshufb(dst, xtmp1, dst, vec_enc); vpsllq(dst, dst, 4, vec_enc); @@ -5290,7 +5290,7 @@ void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMReg // Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte. 
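// For example, for the byte 0x13 = 0001'0011b the LUT yields rev4(0011) = 1100
// and rev4(0001) = 1000, so (1100 << 4) | 1000 = 1100'1000b = 0xC8, the
// bit-reversal of 0x13.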
- vporq(xtmp2, dst, xtmp2, vec_enc); + evporq(xtmp2, dst, xtmp2, vec_enc); vector_reverse_byte(bt, dst, xtmp2, vec_enc); } else if(vec_enc == Assembler::AVX_512bit) { @@ -5345,11 +5345,11 @@ void C2_MacroAssembler::vector_reverse_bit_gfni(BasicType bt, XMMRegister dst, X void C2_MacroAssembler::vector_swap_nbits(int nbits, int bitmask, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, Register rtmp, int vec_enc) { vbroadcast(T_INT, xtmp1, bitmask, rtmp, vec_enc); - vpandq(dst, xtmp1, src, vec_enc); + evpandq(dst, xtmp1, src, vec_enc); vpsllq(dst, dst, nbits, vec_enc); vpandn(xtmp1, xtmp1, src, vec_enc); vpsrlq(xtmp1, xtmp1, nbits, vec_enc); - vporq(dst, dst, xtmp1, vec_enc); + evporq(dst, dst, xtmp1, vec_enc); } void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1, diff --git a/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp b/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp index 3167489f420ca..a1cfd376a474b 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp @@ -173,18 +173,18 @@ void MacroAssembler::poly1305_multiply8_avx512( // = P2_H A2 A1 A0 | = P2_H×2^130 + A2×2^88 + A1×2^44 + A0×2^0 // vpsrlq(TMP1, P0_L, 44, Assembler::AVX_512bit); - vpandq(A0, P0_L, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits + evpandq(A0, P0_L, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits vpsllq(P0_H, P0_H, 8, Assembler::AVX_512bit); vpaddq(P0_H, P0_H, TMP1, Assembler::AVX_512bit); vpaddq(P1_L, P1_L, P0_H, Assembler::AVX_512bit); - vpandq(A1, P1_L, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits + evpandq(A1, P1_L, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits vpsrlq(TMP1, P1_L, 44, Assembler::AVX_512bit); vpsllq(P1_H, P1_H, 8, Assembler::AVX_512bit); vpaddq(P1_H, P1_H, TMP1, Assembler::AVX_512bit); vpaddq(P2_L, P2_L, P1_H, Assembler::AVX_512bit); - vpandq(A2, P2_L, Address(polyCP, mask_42), Assembler::AVX_512bit); // Clear top 22 bits + evpandq(A2, P2_L, Address(polyCP, mask_42), Assembler::AVX_512bit); // Clear top 22 bits vpsrlq(TMP1, P2_L, 42, Assembler::AVX_512bit); vpsllq(P2_H, P2_H, 10, Assembler::AVX_512bit); @@ -196,7 +196,7 @@ void MacroAssembler::poly1305_multiply8_avx512( vpsllq(P2_H, P2_H, 2, Assembler::AVX_512bit); vpaddq(A0, A0, P2_H, Assembler::AVX_512bit); vpsrlq(TMP1, A0, 44, Assembler::AVX_512bit); - vpandq(A0, A0, Address(polyCP, mask_44), Assembler::AVX_512bit); + evpandq(A0, A0, Address(polyCP, mask_44), Assembler::AVX_512bit); vpaddq(A1, A1, TMP1, Assembler::AVX_512bit); } @@ -335,7 +335,7 @@ void MacroAssembler::poly1305_limbs_avx512( // Highest 42-bit limbs of new blocks vpsrlq(L2, TMP1, 24, Assembler::AVX_512bit); if (padMSG) { - vporq(L2, L2, Address(polyCP, high_bit), Assembler::AVX_512bit); // Add 2^128 to all 8 final qwords of the message + evporq(L2, L2, Address(polyCP, high_bit), Assembler::AVX_512bit); // Add 2^128 to all 8 final qwords of the message } // Middle 44-bit limbs of new blocks @@ -344,7 +344,7 @@ void MacroAssembler::poly1305_limbs_avx512( vpternlogq(L1, 0xA8, TMP2, Address(polyCP, mask_44), Assembler::AVX_512bit); // (A OR B AND C) // Lowest 44-bit limbs of new blocks - vpandq(L0, L0, Address(polyCP, mask_44), Assembler::AVX_512bit); + evpandq(L0, L0, Address(polyCP, mask_44), Assembler::AVX_512bit); } // This function consumes as many whole 16*16-byte blocks as available in input @@ -560,7 +560,7 @@ void 
MacroAssembler::poly1305_process_blocks_avx512(const Register input, const // T1 contains the 2 highest bits of the powers of R vpsllq(T1, T1, 40, Assembler::AVX_512bit); - vporq(B2, B2, T1, Assembler::AVX_512bit); + evporq(B2, B2, T1, Assembler::AVX_512bit); // Broadcast 44-bit limbs of R^4 into R0,R1,R2 mov(t0, a0); @@ -597,9 +597,9 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const polyCP); // Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R - vporq(B0, B0, C0, Assembler::AVX_512bit); - vporq(B1, B1, C1, Assembler::AVX_512bit); - vporq(B2, B2, C2, Assembler::AVX_512bit); + evporq(B0, B0, C0, Assembler::AVX_512bit); + evporq(B1, B1, C1, Assembler::AVX_512bit); + evporq(B2, B2, C2, Assembler::AVX_512bit); // Broadcast R^8 vpbroadcastq(R0, B0, Assembler::AVX_512bit); @@ -746,13 +746,13 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const // Carry propagation vpsrlq(T0, A0, 44, Assembler::AVX_512bit); - vpandq(A0, A0, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits + evpandq(A0, A0, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits vpaddq(A1, A1, T0, Assembler::AVX_512bit); vpsrlq(T0, A1, 44, Assembler::AVX_512bit); - vpandq(A1, A1, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits + evpandq(A1, A1, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits vpaddq(A2, A2, T0, Assembler::AVX_512bit); vpsrlq(T0, A2, 42, Assembler::AVX_512bit); - vpandq(A2, A2, Address(polyCP, mask_42), Assembler::AVX_512bit); // Clear top 22 bits + evpandq(A2, A2, Address(polyCP, mask_42), Assembler::AVX_512bit); // Clear top 22 bits vpsllq(T1, T0, 2, Assembler::AVX_512bit); vpaddq(T0, T0, T1, Assembler::AVX_512bit); vpaddq(A0, A0, T0, Assembler::AVX_512bit); diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index ca4e4f86017fc..f4b27fa43a37c 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -2000,6 +2000,7 @@ address StubGenerator::generate_base64_encodeBlock() } address StubGenerator::generate_poly1305_masksCP() { + __ align64(); StubCodeMark mark(this, "StubRoutines", "generate_poly1305_masksCP"); address start = __ pc(); // OFFSET 0: high_bit @@ -2036,7 +2037,7 @@ address StubGenerator::generate_poly1305_masksCP() { } address StubGenerator::generate_poly1305_processBlocks() { - __ align(CodeEntryAlignment); + __ align64(); StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); address start = __ pc(); __ enter(); @@ -2603,7 +2604,7 @@ address StubGenerator::generate_base64_decodeBlock() { // Decode all bytes within our merged input __ evmovdquq(tmp, lookup_lo, Assembler::AVX_512bit); __ evpermt2b(tmp, input_initial_valid_b64, lookup_hi, Assembler::AVX_512bit); - __ vporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit); + __ evporq(mask, tmp, input_initial_valid_b64, Assembler::AVX_512bit); // Check for error. Compare (decoded | initial) to all invalid. // If any bytes have their high-order bit set, then we have an error. 
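For orientation before the Poly1305.java hunk that follows: processMultipleBlocks folds each 16-byte little-endian block into the accumulator as a = (a + (block + 2^128)) * r mod 2^130-5, per RFC7539. The sketch below restates that contract in plain Java with BigInteger; it is a readability aid only (class and method names are illustrative), since the shipped code keeps a and r as 5×26-bit IntegerPolynomial1305 limbs and the stub as 44/44/42-bit limbs.

    import java.math.BigInteger;

    final class Poly1305Reference {
        // p = 2^130 - 5
        static final BigInteger P =
            BigInteger.ONE.shiftLeft(130).subtract(BigInteger.valueOf(5));

        // Same contract as the processMultipleBlocks loop; len is a multiple of 16
        static BigInteger processBlocks(byte[] in, int off, int len,
                                        BigInteger a, BigInteger r) {
            while (len >= 16) {
                byte[] be = new byte[17];
                for (int i = 0; i < 16; i++) {
                    be[16 - i] = in[off + i]; // blocks are little-endian, BigInteger is big-endian
                }
                be[0] = 1;                    // the 0x01 pad byte, i.e. block + 2^128
                a = a.add(new BigInteger(1, be)).multiply(r).mod(P);
                off += 16;
                len -= 16;
            }
            return a;
        }
    }

The 44/44/42-bit split used by the stub exists so that limb products stay within the 104-bit intermediates of vpmadd52luq/vpmadd52huq, with the mask_44/mask_42 constants above implementing the carry propagation between limbs.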
diff --git a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java index 536499c16d8e6..6bd0937817fcb 100644 --- a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java +++ b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java @@ -259,7 +259,7 @@ private void processBlock(byte[] block, int offset, int length) { @IntrinsicCandidate private static void processMultipleBlocks(byte[] input, int offset, int length, byte[] aBytes, byte[] rBytes) { MutableIntegerModuloP A = ipl1305.getElement(aBytes).mutable(); - MutableIntegerModuloP R = ipl1305.getElement(rBytes).mutable(); + IntegerModuloP R = ipl1305.getElement(rBytes).mutable(); MutableIntegerModuloP temp = ipl1305.get0().mutable(); while (length >= BLOCK_LENGTH) { temp.setValue(input, offset, BLOCK_LENGTH, (byte)0x01); diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java index 210bc69b49411..4e365e7cf4a25 100644 --- a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java +++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java @@ -29,16 +29,40 @@ * @summary Unit test for com.sun.crypto.provider.Poly1305. */ - /* +/* * @test * @modules java.base/com.sun.crypto.provider * @run main java.base/com.sun.crypto.provider.Poly1305IntrinsicFuzzTest * @summary Unit test for com.sun.crypto.provider.Poly1305. */ - /* +/* * @test * @modules java.base/com.sun.crypto.provider * @run main java.base/com.sun.crypto.provider.Poly1305KAT * @summary Unit test for com.sun.crypto.provider.Poly1305. */ + +/* + * @test + * @modules java.base/com.sun.crypto.provider + * @run main java.base/com.sun.crypto.provider.Poly1305IntrinsicFuzzTest + * @summary Unit test for IntrinsicCandidate in com.sun.crypto.provider.Poly1305. + * @run main/othervm -Xcomp -XX:-TieredCompilation com.sun.crypto.provider.Cipher.ChaCha20.Poly1305UnitTestDriver + */ + +/* + * @test + * @modules java.base/com.sun.crypto.provider + * @run main java.base/com.sun.crypto.provider.Poly1305KAT + * @summary Unit test for IntrinsicCandidate in com.sun.crypto.provider.Poly1305. 
+ * @run main/othervm -Xcomp -XX:-TieredCompilation com.sun.crypto.provider.Cipher.ChaCha20.Poly1305UnitTestDriver + */ + +package com.sun.crypto.provider.Cipher.ChaCha20; + +public class Poly1305UnitTestDriver { + static public void main(String[] args) { + System.out.println("Passed"); + } +} diff --git a/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java b/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java index f674bc80c93e9..ce9b8ac02d79e 100644 --- a/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java +++ b/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java @@ -34,7 +34,11 @@ import java.security.Key; import java.security.spec.AlgorithmParameterSpec; import javax.crypto.spec.SecretKeySpec; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Warmup; +@Warmup(time=3, iterations=5) // Reflection needs time to be upgraded to bytecodes +@Fork(value = 1, jvmArgsAppend = {"--add-opens", "java.base/com.sun.crypto.provider=ALL-UNNAMED"}) public class Poly1305DigestBench extends CryptoBase { public static final int SET_SIZE = 128; From 883be106db4d9f0508c36d0b0b15b5dc7c96be60 Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Mon, 24 Oct 2022 18:05:11 -0400 Subject: [PATCH 06/23] extra whitespace character --- .../Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java index 4e365e7cf4a25..b8bc22c244af5 100644 --- a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java +++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java @@ -47,7 +47,7 @@ * @test * @modules java.base/com.sun.crypto.provider * @run main java.base/com.sun.crypto.provider.Poly1305IntrinsicFuzzTest - * @summary Unit test for IntrinsicCandidate in com.sun.crypto.provider.Poly1305. + * @summary Unit test for IntrinsicCandidate in com.sun.crypto.provider.Poly1305. 
* @run main/othervm -Xcomp -XX:-TieredCompilation com.sun.crypto.provider.Cipher.ChaCha20.Poly1305UnitTestDriver */ From 78fd8fd748b604e762199a576709e7fcc8059e1d Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Fri, 28 Oct 2022 16:33:27 -0400 Subject: [PATCH 07/23] invalidkeyexception and some review comments --- src/hotspot/cpu/x86/assembler_x86.cpp | 2 +- .../cpu/x86/macroAssembler_x86_poly.cpp | 12 +++--- src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 4 +- src/hotspot/cpu/x86/stubRoutines_x86.cpp | 2 +- src/hotspot/cpu/x86/vm_version_x86.cpp | 1 + .../com/sun/crypto/provider/Poly1305.java | 38 ++++++++++--------- .../com/sun/crypto/provider/Poly1305KAT.java | 1 + 7 files changed, 32 insertions(+), 28 deletions(-) diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index f920f23528d1b..8fb7960453986 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -8303,7 +8303,7 @@ void Assembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, Address assert(vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl(), "requires VL support"); assert(dst != xnoreg, "sanity"); InstructionMark im(this); - InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true); + InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true); attributes.set_is_evex_instruction(); attributes.set_address_attributes(/* tuple_type */ EVEX_FV, /* input_size_in_bits */ EVEX_64bit); vex_prefix(src3, src2->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); diff --git a/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp b/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp index a1cfd376a474b..248b95ac40276 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp @@ -798,12 +798,12 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const vpxorq(R2, R2, R2, Assembler::AVX_512bit); vpxorq(R1P, R1P, R1P, Assembler::AVX_512bit); vpxorq(R2P, R2P, R2P, Assembler::AVX_512bit); - evmovdquq(A0, Address(rsp, 64*3), Assembler::AVX_512bit); - evmovdquq(A0, Address(rsp, 64*4), Assembler::AVX_512bit); - evmovdquq(A0, Address(rsp, 64*5), Assembler::AVX_512bit); - evmovdquq(A0, Address(rsp, 64*0), Assembler::AVX_512bit); - evmovdquq(A0, Address(rsp, 64*1), Assembler::AVX_512bit); - evmovdquq(A0, Address(rsp, 64*2), Assembler::AVX_512bit); + evmovdquq(Address(rsp, 64*3), A0, Assembler::AVX_512bit); + evmovdquq(Address(rsp, 64*4), A0, Assembler::AVX_512bit); + evmovdquq(Address(rsp, 64*5), A0, Assembler::AVX_512bit); + evmovdquq(Address(rsp, 64*0), A0, Assembler::AVX_512bit); + evmovdquq(Address(rsp, 64*1), A0, Assembler::AVX_512bit); + evmovdquq(Address(rsp, 64*2), A0, Assembler::AVX_512bit); addq(rsp, 512/8*6); // (powers of R) } diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index f4b27fa43a37c..223a290b45bf1 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -2000,7 +2000,7 @@ address StubGenerator::generate_base64_encodeBlock() } address StubGenerator::generate_poly1305_masksCP() { - __ align64(); + __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", "generate_poly1305_masksCP"); address start = __ pc(); // OFFSET 0: high_bit @@ -2037,7 +2037,7 @@ address 
StubGenerator::generate_poly1305_masksCP() { } address StubGenerator::generate_poly1305_processBlocks() { - __ align64(); + __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); address start = __ pc(); __ enter(); diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.cpp b/src/hotspot/cpu/x86/stubRoutines_x86.cpp index f1c14eda6fcf4..e72d86010585f 100644 --- a/src/hotspot/cpu/x86/stubRoutines_x86.cpp +++ b/src/hotspot/cpu/x86/stubRoutines_x86.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/src/hotspot/cpu/x86/vm_version_x86.cpp b/src/hotspot/cpu/x86/vm_version_x86.cpp index 390481d3a2922..f54c9c3b2063b 100644 --- a/src/hotspot/cpu/x86/vm_version_x86.cpp +++ b/src/hotspot/cpu/x86/vm_version_x86.cpp @@ -954,6 +954,7 @@ void VM_Version::get_processor_features() { _features &= ~CPU_FLUSHOPT; _features &= ~CPU_GFNI; _features &= ~CPU_AVX512_BITALG; + _features &= ~CPU_AVX512_IFMA; } } diff --git a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java index 6bd0937817fcb..a8e834528061f 100644 --- a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java +++ b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java @@ -172,7 +172,7 @@ void engineUpdate(byte[] input, int offset, int len) { // Intrinsic code; need to extract a and r into bytes // Choice of 1024 is arbitrary, need enough data blocks to amortize conversion overhead // and not affect platforms without intrinsic support - int blockMultipleLength = (len/BLOCK_LENGTH) * BLOCK_LENGTH; + int blockMultipleLength = len & (~(BLOCK_LENGTH-1)); byte[] aBytes = this.a.asByteArray(BLOCK_LENGTH+1); byte[] rBytes = this.r.asByteArray(BLOCK_LENGTH); @@ -283,7 +283,7 @@ private static void processMultipleBlocksCheck(byte[] input, int offset, int len * the R value, and instantiate IntegerModuloP objects to R and S's * numeric values. */ - private void setRSVals() { //throws InvalidKeyException { + private void setRSVals() throws InvalidKeyException { // Clamp the bytes in the "r" half of the key. 
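// Per RFC7539, sec. 2.5 this clears the top four bits of r[3], r[7], r[11],
// r[15] and the bottom two bits of r[4], r[8], r[12]; read little-endian,
// r &= 0x0ffffffc0ffffffc0ffffffc0fffffff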
keyBytes[3] &= 15; keyBytes[7] &= 15; @@ -293,25 +293,27 @@ private void setRSVals() { //throws InvalidKeyException { keyBytes[8] &= (byte)252; keyBytes[12] &= (byte)252; - // This should be enabled, but Poly1305KAT would fail - // byte keyIsZero = 0; - // for (int i = 0; i < RS_LENGTH; i++) { - // keyIsZero |= keyBytes[i]; - // } - // if (keyIsZero == 0) { - // throw new InvalidKeyException("R is set to zero"); - // } - - // keyIsZero = 0; - // for (int i = RS_LENGTH; i < 2*RS_LENGTH; i++) { - // keyIsZero |= keyBytes[i]; - // } - // if (keyIsZero == 0) { - // throw new InvalidKeyException("S is set to zero"); - // } + byte keyIsZero = 0; + for (int i = 0; i < RS_LENGTH; i++) { + keyIsZero |= keyBytes[i]; + } + if (keyIsZero == 0 && !katTesting) { + throw new InvalidKeyException("R is set to zero"); + } + + keyIsZero = 0; + for (int i = RS_LENGTH; i < 2*RS_LENGTH; i++) { + keyIsZero |= keyBytes[i]; + } + if (keyIsZero == 0 && !katTesting) { + throw new InvalidKeyException("S is set to zero"); + } // Create IntegerModuloP elements from the r and s values r = ipl1305.getElement(keyBytes, 0, RS_LENGTH, (byte)0); s = ipl1305.getElement(keyBytes, RS_LENGTH, RS_LENGTH, (byte)0); } + + // KAT testing expects R and/or S to be set to 0 for some tests + static boolean katTesting = false; } diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java index f9429ec637d44..e5046cbef1d3b 100644 --- a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java +++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java @@ -144,6 +144,7 @@ public static void main(String args[]) throws Exception { private static boolean runSingleTest(TestData testData) throws Exception { Poly1305 authenticator = new Poly1305(); + authenticator.katTesting = true; authenticator.engineInit(new SecretKeySpec(testData.key, 0, testData.key.length, "Poly1305"), null); authenticator.engineUpdate(testData.input, 0, testData.input.length); byte[] tag = authenticator.engineDoFinal(); From 977e0272f3a7742b08ec68e9ab1d9c5f5a45ce32 Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Thu, 3 Nov 2022 23:15:00 -0400 Subject: [PATCH 08/23] address Jamil's review --- src/hotspot/cpu/x86/macroAssembler_x86.hpp | 2 + .../cpu/x86/macroAssembler_x86_poly.cpp | 116 ++++++++++++++++-- src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 17 +-- src/hotspot/share/classfile/vmIntrinsics.hpp | 3 +- src/hotspot/share/opto/library_call.cpp | 62 ++++++++-- src/hotspot/share/opto/runtime.cpp | 3 +- .../com/sun/crypto/provider/Poly1305.java | 83 +++++-------- .../security/util/math/IntegerModuloP.java | 5 + .../util/math/intpoly/IntegerPolynomial.java | 6 + .../com/sun/crypto/provider/Poly1305KAT.java | 3 +- .../crypto/full/Poly1305DigestBench.java | 4 +- 11 files changed, 215 insertions(+), 89 deletions(-) diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index f081550093632..6390b3d9dcf7a 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -977,6 +977,8 @@ class MacroAssembler: public Assembler { const Register R0, const Register R1, const Register C1, bool only128); void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, const 
XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, const Register polyCP); + void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, bool only128); + void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs); public: void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, diff --git a/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp b/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp index 248b95ac40276..00d51bc25e797 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp @@ -347,6 +347,107 @@ void MacroAssembler::poly1305_limbs_avx512( evpandq(L0, L0, Address(polyCP, mask_44), Assembler::AVX_512bit); } +/** + * Copy 5×26-bit (unreduced) limbs stored at Register limbs into a2:a1:a0 (3×64-bit limbs) + * + * a2 is optional. When only128 is set, limbs are expected to fit into 128-bits (i.e. a1:a0 such as clamped R) + */ +void MacroAssembler::poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, bool only128) +{ + const Register t1 = r13; + const Register t2 = r14; + + movq(a0, Address(limbs, 0)); + movq(t1, Address(limbs, 8)); + shlq(t1, 26); + addq(a0, t1); + movq(t1, Address(limbs, 16)); + movq(t2, Address(limbs, 24)); + movq(a1, t1); + shlq(t1, 52); + shrq(a1, 12); + shlq(t2, 14); + addq(a0, t1); + adcq(a1, t2); + movq(t1, Address(limbs, 32)); + if (!only128) { + movq(a2, t1); + shrq(a2, 24); + } + shlq(t1, 40); + addq(a1, t1); + if (only128) { + return; + } + adcq(a2, 0); + + // One round of reduction + // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0 + movq(t1, a2); + andq(t1, ~3); + andq(a2, 3); + movq(t2, t1); + shrq(t2, 2); + addq(t1, t2); + + addq(a0, t1); + adcq(a1, 0); + adcq(a2, 0); +} + + +/** + * Break 3×64-bit a2:a1:a0 limbs into 5×26-bit limbs and store out into 5 quadwords at address `limbs` + */ +void MacroAssembler::poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs) +{ + const Register t1 = r13; + const Register t2 = r14; + + // Extra round of reduction + // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0 + movq(t1, a2); + andq(t1, ~3); + andq(a2, 3); + movq(t2, t1); + shrq(t2, 2); + addq(t1, t2); + + addq(a0, t1); + adcq(a1, 0); + adcq(a2, 0); + + // Chop a2:a1:a0 into 26-bit limbs + movl(t1, a0); + andl(t1, 0x3ffffff); + movq(Address(limbs, 0), t1); + + shrq(a0, 26); + movl(t1, a0); + andl(t1, 0x3ffffff); + movq(Address(limbs, 8), t1); + + shrq(a0, 26); // 12 bits left in a0, concatenate 14 from a1 + movl(t1, a1); + shll(t1, 12); + addl(t1, a0); + andl(t1, 0x3ffffff); + movq(Address(limbs, 16), t1); + + shrq(a1, 14); // already used up 14 bits + shlq(a2, 50); // a2 contains 2 bits when reduced, but $Element.limbs dont have to be fully reduced + addq(a1, a2); // put remaining bits into a1 + + movl(t1, a1); + andl(t1, 0x3ffffff); + movq(Address(limbs, 24), t1); + + shrq(a1, 26); + movl(t1, a1); + //andl(t1, 0x3ffffff); doesnt have to be fully reduced, leave remaining bit(s) + movq(Address(limbs, 32), t1); +} + // This function consumes as many whole 16*16-byte blocks as available in input // After execution, input and length will point at remaining (unprocessed) data // and [a2 a1 a0] will contain the current accumulator value @@ -828,19 +929,16 @@ void MacroAssembler::poly1305_process_blocks(Register input, Register length, 
Re Label L_process16Loop, L_process16LoopDone; - // Load R - movq(r0, Address(R, 0)); - movq(r1, Address(R, 8)); + // Load R into r1:r0 + poly1305_limbs(R, r0, r1, r1, true); // Compute 5*R (Upper limb only) movq(c1, r1); shrq(c1, 2); addq(c1, r1); // c1 = r1 + (r1 >> 2) - // Load accumulator - movq(a0, Address(accumulator, 0)); - movq(a1, Address(accumulator, 8)); - movzbq(a2, Address(accumulator, 16)); + // Load accumulator into a2:a1:a0 + poly1305_limbs(accumulator, a0, a1, a2, false); // VECTOR LOOP: Minimum of 256 bytes to run vectorized code cmpl(length, 16*16); @@ -866,9 +964,7 @@ void MacroAssembler::poly1305_process_blocks(Register input, Register length, Re bind(L_process16LoopDone); // Write output - movq(Address(accumulator, 0), a0); - movq(Address(accumulator, 8), a1); - movb(Address(accumulator, 16), a2); + poly1305_limbs_out(a0, a1, a2, accumulator); } #endif // _LP64 diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index 223a290b45bf1..8e7259f2eb887 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -2053,18 +2053,21 @@ address StubGenerator::generate_poly1305_processBlocks() { __ push(r14); __ push(r15); - // Normalize input - // JAVA: void processBlocks(byte[] input, int offset, int len, byte[] a, byte[] r) + // void processBlocks(byte[] input, int len, int[5] a, int[5] r) const Register input = rdi; //input+offset const Register length = rbx; const Register accumulator = rcx; const Register R = r8; - __ lea(input, Address(c_rarg0, c_rarg1)); - __ mov(length, c_rarg2); - #ifdef _WIN64 // R and acc already in correct position for linux - __ mov(accumulator, r9); // arg#3 - acc - __ movptr(R, Address(rbp, 6 * wordSize)); // arg#4 - R + #ifdef _WIN64 + __ mov(input, c_rarg0); + __ mov(length, c_rarg1); + __ mov(accumulator, c_rarg2); + __ mov(R, c_rarg3); + #else // input already in correct position for linux; dont clobber R, args copied out-of-order + __ mov(length, c_rarg1); + __ mov(R, c_rarg3); + __ mov(accumulator, c_rarg2); #endif __ poly1305_process_blocks(input, length, accumulator, R); diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index 3406de22b7fa9..a862f01b118cd 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -523,9 +523,8 @@ class methodHandle; \ /* support for com.sun.crypto.provider.Poly1305 */ \ do_class(com_sun_crypto_provider_Poly1305, "com/sun/crypto/provider/Poly1305") \ - do_intrinsic(_poly1305_processBlocks, com_sun_crypto_provider_Poly1305, processMultipleBlocks_name, polyBlock_signature, F_S) \ + do_intrinsic(_poly1305_processBlocks, com_sun_crypto_provider_Poly1305, processMultipleBlocks_name, putCharStringU_signature, F_R) \ do_name(processMultipleBlocks_name, "processMultipleBlocks") \ - do_signature(polyBlock_signature, "([BII[B[B)V") \ \ /* support for com.sun.crypto.provider.GHASH */ \ do_class(com_sun_crypto_provider_ghash, "com/sun/crypto/provider/GHASH") \ diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 62b724c0178ef..a8281a4528422 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -6973,32 +6973,70 @@ bool LibraryCallKit::inline_poly1305_processBlocks() { address stubAddr; const char *stubName; assert(UsePolyIntrinsics, "need Poly intrinsics support"); - assert(callee()->signature()->size() == 5, 
"poly1305_processBlocks has 5 parameters"); + assert(callee()->signature()->size() == 3, "poly1305_processBlocks has %d parameters", callee()->signature()->size()); stubAddr = StubRoutines::poly1305_processBlocks(); stubName = "poly1305_processBlocks"; if (!stubAddr) return false; - Node* input = argument(0); - Node* input_offset = argument(1); - Node* len = argument(2); - Node* acc = argument(3); - Node* r = argument(4); + Node* polyObj = argument(0); + Node* input = argument(1); + Node* input_offset = argument(2); + Node* len = argument(3); + + Node* accFace = load_field_from_object(polyObj, "a", "Lsun/security/util/math/MutableIntegerModuloP;"); + assert(accFace != NULL, "Accumulator field is null"); + const TypeInstPtr* ainst = _gvn.type(accFace)->isa_instptr(); + assert(ainst != NULL, "Accumulator obj is null"); + assert(ainst->is_loaded(), "MutableIntegerModuloP obj is not loaded"); + ciKlass* klass_MutableElement = ainst->instance_klass()->find_klass(ciSymbol::make("sun/security/util/math/intpoly/IntegerPolynomial$MutableElement")); + assert(klass_MutableElement != NULL, "IntegerPolynomial$MutableElement class is null"); + assert(klass_MutableElement->is_loaded(), "IntegerPolynomial$MutableElement class is not loaded"); + ciInstanceKlass* instklass_MutableElement = klass_MutableElement->as_instance_klass(); + + const TypeKlassPtr* aklass = TypeKlassPtr::make(instklass_MutableElement); + const TypeOopPtr* atype = aklass->as_instance_type()->cast_to_ptr_type(TypePtr::NotNull); + Node* accObj = new CheckCastPPNode(control(), accFace, atype); + accObj = _gvn.transform(accObj); + Node* alimbs = load_field_from_object(accObj, "limbs", "[J"); + + Node* rFace = load_field_from_object(polyObj, "r", "Lsun/security/util/math/IntegerModuloP;"); //this.r.limbs + assert(rFace != NULL, "R field is null"); + const TypeInstPtr* rinst = _gvn.type(rFace)->isa_instptr(); + assert(rinst != NULL, "R obj is null"); + assert(rinst->is_loaded(), "IntegerModuloP obj is not loaded"); + ciKlass* klass_ImmutableElement = rinst->instance_klass()->find_klass(ciSymbol::make("sun/security/util/math/intpoly/IntegerPolynomial$ImmutableElement")); + assert(klass_ImmutableElement != NULL, "IntegerPolynomial$ImmutableElement class is null"); + assert(klass_ImmutableElement->is_loaded(), "IntegerPolynomial$ImmutableElement class is not loaded"); + ciInstanceKlass* instklass_ImmutableElement = klass_ImmutableElement->as_instance_klass(); + + const TypeKlassPtr* rklass = TypeKlassPtr::make(instklass_ImmutableElement); + const TypeOopPtr* rtype = rklass->as_instance_type()->cast_to_ptr_type(TypePtr::NotNull); + Node* rObj = new CheckCastPPNode(control(), rFace, rtype); + rObj = _gvn.transform(rObj); + Node* rlimbs = load_field_from_object(rObj, "limbs", "[J"); input = must_be_not_null(input, true); - acc = must_be_not_null(acc, true); - r = must_be_not_null(r, true); + alimbs = must_be_not_null(alimbs, true); + rlimbs = must_be_not_null(rlimbs, true); + + // Intrinsic assumes there are exactly 5 limbs! Currently enforced by IntegerModuloP.checkLimbsForIntrinsic + // FIXME: where to branch to if limbs array length != 5? 
Could be an 'assert'/RuntimeException + // FIXME: repeat for rlimbs + // Node* cmp = _gvn.transform(new CmpINode(load_array_length(alimbs), intcon(5))); + // Node* bol = _gvn.transform(new BoolNode(cmp, BoolTest::eq)); + // Node* if_eq = generate_slow_guard(bol, slow_region); - Node* input_start = array_element_address(input, intcon(0), T_BYTE); + Node* input_start = array_element_address(input, input_offset, T_BYTE); assert(input_start, "input array is NULL"); - Node* acc_start = array_element_address(acc, intcon(0), T_BYTE); + Node* acc_start = array_element_address(alimbs, intcon(0), T_LONG); assert(acc_start, "acc array is NULL"); - Node* r_start = array_element_address(r, intcon(0), T_BYTE); + Node* r_start = array_element_address(rlimbs, intcon(0), T_LONG); assert(r_start, "r array is NULL"); Node* call = make_runtime_call(RC_LEAF, OptoRuntime::poly1305_processBlocks_Type(), stubAddr, stubName, TypePtr::BOTTOM, - input_start, input_offset, len, acc_start, r_start); + input_start, len, acc_start, r_start); return true; } diff --git a/src/hotspot/share/opto/runtime.cpp b/src/hotspot/share/opto/runtime.cpp index 2becd93527c37..7b08cab6748f8 100644 --- a/src/hotspot/share/opto/runtime.cpp +++ b/src/hotspot/share/opto/runtime.cpp @@ -1268,12 +1268,11 @@ const TypeFunc* OptoRuntime::base64_decodeBlock_Type() { // Poly1305 processMultipleBlocks function const TypeFunc* OptoRuntime::poly1305_processBlocks_Type() { - int argcnt = 5; + int argcnt = 4; const Type** fields = TypeTuple::fields(argcnt); int argp = TypeFunc::Parms; fields[argp++] = TypePtr::NOTNULL; // input array - fields[argp++] = TypeInt::INT; // input offset fields[argp++] = TypeInt::INT; // input length fields[argp++] = TypePtr::NOTNULL; // accumulator array fields[argp++] = TypePtr::NOTNULL; // r array diff --git a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java index a8e834528061f..c335588ff8633 100644 --- a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java +++ b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java @@ -25,6 +25,7 @@ package com.sun.crypto.provider; +import java.lang.reflect.Field; import java.nio.ByteBuffer; import java.security.Key; import java.security.InvalidKeyException; @@ -61,8 +62,10 @@ final class Poly1305 { private IntegerModuloP s; private MutableIntegerModuloP a; private final MutableIntegerModuloP n = ipl1305.get1().mutable(); + private final boolean checkWeakKey; - Poly1305() { } + Poly1305() { this(true); } + Poly1305(boolean checkKey) { checkWeakKey = checkKey; } /** * Initialize the Poly1305 object @@ -168,26 +171,13 @@ void engineUpdate(byte[] input, int offset, int len) { } } - if (len >= 1024) { - // Intrinsic code; need to extract a and r into bytes - // Choice of 1024 is arbitrary, need enough data blocks to amortize conversion overhead - // and not affect platforms without intrinsic support - int blockMultipleLength = len & (~(BLOCK_LENGTH-1)); - byte[] aBytes = this.a.asByteArray(BLOCK_LENGTH+1); - byte[] rBytes = this.r.asByteArray(BLOCK_LENGTH); - - processMultipleBlocksCheck(input, offset, blockMultipleLength, aBytes, rBytes); - processMultipleBlocks(input, offset, blockMultipleLength, aBytes, rBytes); - this.a.setValue(aBytes, 0, aBytes.length, (byte) 0); - offset += blockMultipleLength; - len -= blockMultipleLength; - } else { - while (len >= BLOCK_LENGTH) { - processBlock(input, offset, BLOCK_LENGTH); - offset += BLOCK_LENGTH; - len -= BLOCK_LENGTH; - } - } + int 
blockMultipleLength = len & (~(BLOCK_LENGTH-1)); + Objects.checkFromIndexSize(offset, blockMultipleLength, input.length); + a.checkLimbsForIntrinsic(); + r.checkLimbsForIntrinsic(); + processMultipleBlocks(input, offset, blockMultipleLength); + offset += blockMultipleLength; + len -= blockMultipleLength; if (len > 0) { // and len < BLOCK_LENGTH System.arraycopy(input, offset, block, 0, len); @@ -254,28 +244,16 @@ private void processBlock(byte[] block, int offset, int length) { a.setProduct(r); // a = (a * r) % p } - // Emulate intrinsic, no access to class variables, but means extra conversions @ForceInline @IntrinsicCandidate - private static void processMultipleBlocks(byte[] input, int offset, int length, byte[] aBytes, byte[] rBytes) { - MutableIntegerModuloP A = ipl1305.getElement(aBytes).mutable(); - IntegerModuloP R = ipl1305.getElement(rBytes).mutable(); - MutableIntegerModuloP temp = ipl1305.get0().mutable(); + private void processMultipleBlocks(byte[] input, int offset, int length) { while (length >= BLOCK_LENGTH) { - temp.setValue(input, offset, BLOCK_LENGTH, (byte)0x01); - A.setSum(temp); // A += (temp | 0x01) - A.setProduct(R); // A = (A * R) % p + n.setValue(input, offset, BLOCK_LENGTH, (byte)0x01); + a.setSum(n); // A += (temp | 0x01) + a.setProduct(r); // A = (A * R) % p offset += BLOCK_LENGTH; length -= BLOCK_LENGTH; } - - A.asByteArray(aBytes); - } - - private static void processMultipleBlocksCheck(byte[] input, int offset, int length, byte[] a, byte[] r) { - Objects.checkFromIndexSize(offset, length, input.length); - Objects.checkFromIndexSize(0, BLOCK_LENGTH+1, a.length); - Objects.checkFromIndexSize(0, BLOCK_LENGTH, r.length); } /** @@ -293,27 +271,26 @@ private void setRSVals() throws InvalidKeyException { keyBytes[8] &= (byte)252; keyBytes[12] &= (byte)252; - byte keyIsZero = 0; - for (int i = 0; i < RS_LENGTH; i++) { - keyIsZero |= keyBytes[i]; - } - if (keyIsZero == 0 && !katTesting) { - throw new InvalidKeyException("R is set to zero"); - } + if (checkWeakKey) { + byte keyIsZero = 0; + for (int i = 0; i < RS_LENGTH; i++) { + keyIsZero |= keyBytes[i]; + } + if (keyIsZero == 0) { + throw new InvalidKeyException("R is set to zero"); + } - keyIsZero = 0; - for (int i = RS_LENGTH; i < 2*RS_LENGTH; i++) { - keyIsZero |= keyBytes[i]; - } - if (keyIsZero == 0 && !katTesting) { - throw new InvalidKeyException("S is set to zero"); + keyIsZero = 0; + for (int i = RS_LENGTH; i < 2*RS_LENGTH; i++) { + keyIsZero |= keyBytes[i]; + } + if (keyIsZero == 0) { + throw new InvalidKeyException("S is set to zero"); + } } // Create IntegerModuloP elements from the r and s values r = ipl1305.getElement(keyBytes, 0, RS_LENGTH, (byte)0); s = ipl1305.getElement(keyBytes, RS_LENGTH, RS_LENGTH, (byte)0); } - - // KAT testing expects R and/or S to be set to 0 for some tests - static boolean katTesting = false; } diff --git a/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java b/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java index e6b0cff2c876c..996c6d795eadd 100644 --- a/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java +++ b/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java @@ -208,5 +208,10 @@ default ImmutableIntegerModuloP pow(BigInteger b) { return y.fixed(); } + /** + * Enforce java to IntrinsicCandidate 'contract' + * @throws IndexOutOfBoundsException if the check fails + */ + void checkLimbsForIntrinsic(); } diff --git a/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java 
b/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java index 810c3fb3b86f8..43c67dfc136b7 100644 --- a/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java +++ b/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java @@ -32,6 +32,8 @@ import java.nio.ByteOrder; import java.util.Arrays; +import jdk.internal.util.Preconditions; + /** * A large number polynomial representation using sparse limbs of signed * long (64-bit) values. Limb values will always fit within a long, so inputs @@ -626,6 +628,10 @@ public void asByteArray(byte[] result) { } limbsToByteArray(limbs, result); } + + public void checkLimbsForIntrinsic() { + Preconditions.checkFromIndexSize(0, numLimbs, limbs.length, null); + } } protected class MutableElement extends Element diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java index e5046cbef1d3b..649d1888c70b7 100644 --- a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java +++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305KAT.java @@ -143,8 +143,7 @@ public static void main(String args[]) throws Exception { } private static boolean runSingleTest(TestData testData) throws Exception { - Poly1305 authenticator = new Poly1305(); - authenticator.katTesting = true; + Poly1305 authenticator = new Poly1305(false); authenticator.engineInit(new SecretKeySpec(testData.key, 0, testData.key.length, "Poly1305"), null); authenticator.engineUpdate(testData.input, 0, testData.input.length); byte[] tag = authenticator.engineDoFinal(); diff --git a/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java b/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java index ce9b8ac02d79e..aa45aa2e398a0 100644 --- a/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java +++ b/test/micro/org/openjdk/bench/javax/crypto/full/Poly1305DigestBench.java @@ -36,8 +36,10 @@ import javax.crypto.spec.SecretKeySpec; import org.openjdk.jmh.annotations.Fork; import org.openjdk.jmh.annotations.Warmup; +import org.openjdk.jmh.annotations.Measurement; -@Warmup(time=3, iterations=5) // Reflection needs time to be upgraded to bytecodes +@Measurement(iterations = 3, time = 10) +@Warmup(iterations = 3, time = 10) @Fork(value = 1, jvmArgsAppend = {"--add-opens", "java.base/com.sun.crypto.provider=ALL-UNNAMED"}) public class Poly1305DigestBench extends CryptoBase { public static final int SET_SIZE = 128; From 1841df110f4dd1fa1069468158612f04162f6aa8 Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Tue, 8 Nov 2022 15:47:32 -0500 Subject: [PATCH 09/23] iwanowww review --- src/hotspot/cpu/x86/macroAssembler_x86.hpp | 13 - src/hotspot/cpu/x86/stubGenerator_x86_64.cpp | 89 -- src/hotspot/cpu/x86/stubGenerator_x86_64.hpp | 12 +- ...86_poly.cpp => stubGenerator_x86_poly.cpp} | 847 ++++++++++-------- src/hotspot/cpu/x86/stubRoutines_x86.cpp | 1 - src/hotspot/cpu/x86/stubRoutines_x86.hpp | 2 - src/hotspot/share/opto/library_call.cpp | 2 +- 7 files changed, 464 insertions(+), 502 deletions(-) rename src/hotspot/cpu/x86/{macroAssembler_x86_poly.cpp => stubGenerator_x86_poly.cpp} (53%) diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index 
6390b3d9dcf7a..5cd0efadac561 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -968,18 +968,6 @@ class MacroAssembler: public Assembler { void addmq(int disp, Register r1, Register r2); - void poly1305_process_blocks_avx512(const Register input, const Register length, - const Register A0, const Register A1, const Register A2, - const Register R0, const Register R1, const Register C1); - void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1, - const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, const Register polyCP); - void poly1305_multiply_scalar(const Register A0, const Register A1, const Register A2, - const Register R0, const Register R1, const Register C1, bool only128); - void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, - const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, const Register polyCP); - void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, bool only128); - void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs); - public: void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, @@ -990,7 +978,6 @@ class MacroAssembler: public Assembler { Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block, XMMRegister shuf_mask); - void poly1305_process_blocks(Register input, Register length, Register accumulator, Register R); #endif // _LP64 void fast_md5(Register buf, Address state, Address ofs, Address limit, diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp index 8e7259f2eb887..eba62f9d4c906 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp @@ -1999,94 +1999,6 @@ address StubGenerator::generate_base64_encodeBlock() return start; } -address StubGenerator::generate_poly1305_masksCP() { - __ align(CodeEntryAlignment); - StubCodeMark mark(this, "StubRoutines", "generate_poly1305_masksCP"); - address start = __ pc(); - // OFFSET 0: high_bit - __ emit_data64(0x0000010000000000, relocInfo::none); - __ emit_data64(0x0000010000000000, relocInfo::none); - __ emit_data64(0x0000010000000000, relocInfo::none); - __ emit_data64(0x0000010000000000, relocInfo::none); - __ emit_data64(0x0000010000000000, relocInfo::none); - __ emit_data64(0x0000010000000000, relocInfo::none); - __ emit_data64(0x0000010000000000, relocInfo::none); - __ emit_data64(0x0000010000000000, relocInfo::none); - - // OFFSET 64: mask_44 - __ emit_data64(0xfffffffffff, relocInfo::none); - __ emit_data64(0xfffffffffff, relocInfo::none); - __ emit_data64(0xfffffffffff, relocInfo::none); - __ emit_data64(0xfffffffffff, relocInfo::none); - __ emit_data64(0xfffffffffff, relocInfo::none); - __ emit_data64(0xfffffffffff, relocInfo::none); - __ emit_data64(0xfffffffffff, relocInfo::none); - __ emit_data64(0xfffffffffff, relocInfo::none); - - // OFFSET 128: mask_42 - __ emit_data64(0x3ffffffffff, relocInfo::none); - __ emit_data64(0x3ffffffffff, relocInfo::none); - __ emit_data64(0x3ffffffffff, relocInfo::none); - __ emit_data64(0x3ffffffffff, relocInfo::none); - __ emit_data64(0x3ffffffffff, relocInfo::none); - __ emit_data64(0x3ffffffffff, relocInfo::none); - __ 
emit_data64(0x3ffffffffff, relocInfo::none); - __ emit_data64(0x3ffffffffff, relocInfo::none); - - return start; -} - -address StubGenerator::generate_poly1305_processBlocks() { - __ align(CodeEntryAlignment); - StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); - address start = __ pc(); - __ enter(); - - // Save all 'SOE' registers - __ push(rbx); - #ifdef _WIN64 - __ push(rsi); - __ push(rdi); - #endif - __ push(r12); - __ push(r13); - __ push(r14); - __ push(r15); - - // void processBlocks(byte[] input, int len, int[5] a, int[5] r) - const Register input = rdi; //input+offset - const Register length = rbx; - const Register accumulator = rcx; - const Register R = r8; - - #ifdef _WIN64 - __ mov(input, c_rarg0); - __ mov(length, c_rarg1); - __ mov(accumulator, c_rarg2); - __ mov(R, c_rarg3); - #else // input already in correct position for linux; dont clobber R, args copied out-of-order - __ mov(length, c_rarg1); - __ mov(R, c_rarg3); - __ mov(accumulator, c_rarg2); - #endif - - __ poly1305_process_blocks(input, length, accumulator, R); - - __ pop(r15); - __ pop(r14); - __ pop(r13); - __ pop(r12); - #ifdef _WIN64 - __ pop(rdi); - __ pop(rsi); - #endif - __ pop(rbx); - - __ leave(); - __ ret(0); - return start; -} - // base64 AVX512vbmi tables address StubGenerator::base64_vbmi_lookup_lo_addr() { __ align64(); @@ -3798,7 +3710,6 @@ void StubGenerator::generate_initial() { } if (UsePolyIntrinsics) { - StubRoutines::x86::_poly1305_mask_addr = generate_poly1305_masksCP(); StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks(); } diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp index d9600aa0f1bbf..c4521dda09562 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp @@ -388,8 +388,18 @@ class StubGenerator: public StubCodeGenerator { address generate_avx_ghash_processBlocks(); // Poly1305 multiblock using IFMA instructions - address generate_poly1305_masksCP(); address generate_poly1305_processBlocks(); + void poly1305_process_blocks_avx512(const Register input, const Register length, + const Register A0, const Register A1, const Register A2, + const Register R0, const Register R1, const Register C1); + void poly1305_multiply_scalar(const Register A0, const Register A1, const Register A2, + const Register R0, const Register R1, const Register C1, bool only128); + void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, + const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, const Register polyCP); + void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, bool only128); + void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs); + void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1, + const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, const Register polyCP); // BASE64 stubs diff --git a/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_poly.cpp similarity index 53% rename from src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp rename to src/hotspot/cpu/x86/stubGenerator_x86_poly.cpp index 00d51bc25e797..68e5a00236dc2 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86_poly.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_poly.cpp @@ -24,12 +24,11 @@ */ #include "precompiled.hpp" -#include 
"asm/assembler.hpp" -#include "asm/assembler.inline.hpp" -#include "runtime/stubRoutines.hpp" #include "macroAssembler_x86.hpp" +#include "stubGenerator_x86_64.hpp" + +#define __ _masm-> -#ifdef _LP64 // References: // - (Normative) RFC7539 - ChaCha20 and Poly1305 for IETF Protocols // - M. Goll and S. Gueron, "Vectorization of Poly1305 Message Authentication Code" @@ -79,12 +78,34 @@ // B: xmm19-24 // R: xmm25-29 -// Constant Pool OFfsets: +// Constant Pool Offsets: enum polyCPOffset { high_bit = 0, mask_44 = 64, mask_42 = 128, }; +ATTRIBUTE_ALIGNED(64) uint64_t POLY1305_CP[] = { + // OFFSET 0: high_bit + 0x0000010000000000, 0x0000010000000000, + 0x0000010000000000, 0x0000010000000000, + 0x0000010000000000, 0x0000010000000000, + 0x0000010000000000, 0x0000010000000000, + + // OFFSET 64: mask_44 + 0xfffffffffff, 0xfffffffffff, + 0xfffffffffff, 0xfffffffffff, + 0xfffffffffff, 0xfffffffffff, + 0xfffffffffff, 0xfffffffffff, + + // OFFSET 128: mask_42 + 0x3ffffffffff, 0x3ffffffffff, + 0x3ffffffffff, 0x3ffffffffff, + 0x3ffffffffff, 0x3ffffffffff, + 0x3ffffffffff, 0x3ffffffffff +}; +static address poly1305_mask_addr() { + return (address)POLY1305_CP; +} // Compute product for 8 16-byte message blocks, // i.e. For each block, compute [a2 a1 a0] = [a2 a1 a0] × [r2 r1 r0] @@ -123,7 +144,7 @@ enum polyCPOffset { // = (a2×[5×r2×4] × 2^44) mod 2^130-5 // = (a2×R2P × 2^44) mod 2^130-5 // i.e. R2P = 4*5*R2 // -void MacroAssembler::poly1305_multiply8_avx512( +void StubGenerator::poly1305_multiply8_avx512( const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, const Register polyCP) { @@ -136,34 +157,34 @@ void MacroAssembler::poly1305_multiply8_avx512( const XMMRegister TMP1 = xmm6; // Reset partial sums - evpxorq(P0_L, P0_L, P0_L, Assembler::AVX_512bit); - evpxorq(P0_H, P0_H, P0_H, Assembler::AVX_512bit); - evpxorq(P1_L, P1_L, P1_L, Assembler::AVX_512bit); - evpxorq(P1_H, P1_H, P1_H, Assembler::AVX_512bit); - evpxorq(P2_L, P2_L, P2_L, Assembler::AVX_512bit); - evpxorq(P2_H, P2_H, P2_H, Assembler::AVX_512bit); + __ evpxorq(P0_L, P0_L, P0_L, Assembler::AVX_512bit); + __ evpxorq(P0_H, P0_H, P0_H, Assembler::AVX_512bit); + __ evpxorq(P1_L, P1_L, P1_L, Assembler::AVX_512bit); + __ evpxorq(P1_H, P1_H, P1_H, Assembler::AVX_512bit); + __ evpxorq(P2_L, P2_L, P2_L, Assembler::AVX_512bit); + __ evpxorq(P2_H, P2_H, P2_H, Assembler::AVX_512bit); // Calculate partial products - evpmadd52luq(P0_L, A2, R1P, Assembler::AVX_512bit); - evpmadd52huq(P0_H, A2, R1P, Assembler::AVX_512bit); - evpmadd52luq(P1_L, A2, R2P, Assembler::AVX_512bit); - evpmadd52huq(P1_H, A2, R2P, Assembler::AVX_512bit); - evpmadd52luq(P2_L, A2, R0, Assembler::AVX_512bit); - evpmadd52huq(P2_H, A2, R0, Assembler::AVX_512bit); - - evpmadd52luq(P1_L, A0, R1, Assembler::AVX_512bit); - evpmadd52huq(P1_H, A0, R1, Assembler::AVX_512bit); - evpmadd52luq(P2_L, A0, R2, Assembler::AVX_512bit); - evpmadd52huq(P2_H, A0, R2, Assembler::AVX_512bit); - evpmadd52luq(P0_L, A0, R0, Assembler::AVX_512bit); - evpmadd52huq(P0_H, A0, R0, Assembler::AVX_512bit); - - evpmadd52luq(P0_L, A1, R2P, Assembler::AVX_512bit); - evpmadd52huq(P0_H, A1, R2P, Assembler::AVX_512bit); - evpmadd52luq(P1_L, A1, R0, Assembler::AVX_512bit); - evpmadd52huq(P1_H, A1, R0, Assembler::AVX_512bit); - evpmadd52luq(P2_L, A1, R1, Assembler::AVX_512bit); - evpmadd52huq(P2_H, A1, R1, Assembler::AVX_512bit); + __ evpmadd52luq(P0_L, A2, R1P, Assembler::AVX_512bit); + __ 
evpmadd52huq(P0_H, A2, R1P, Assembler::AVX_512bit); + __ evpmadd52luq(P1_L, A2, R2P, Assembler::AVX_512bit); + __ evpmadd52huq(P1_H, A2, R2P, Assembler::AVX_512bit); + __ evpmadd52luq(P2_L, A2, R0, Assembler::AVX_512bit); + __ evpmadd52huq(P2_H, A2, R0, Assembler::AVX_512bit); + + __ evpmadd52luq(P1_L, A0, R1, Assembler::AVX_512bit); + __ evpmadd52huq(P1_H, A0, R1, Assembler::AVX_512bit); + __ evpmadd52luq(P2_L, A0, R2, Assembler::AVX_512bit); + __ evpmadd52huq(P2_H, A0, R2, Assembler::AVX_512bit); + __ evpmadd52luq(P0_L, A0, R0, Assembler::AVX_512bit); + __ evpmadd52huq(P0_H, A0, R0, Assembler::AVX_512bit); + + __ evpmadd52luq(P0_L, A1, R2P, Assembler::AVX_512bit); + __ evpmadd52huq(P0_H, A1, R2P, Assembler::AVX_512bit); + __ evpmadd52luq(P1_L, A1, R0, Assembler::AVX_512bit); + __ evpmadd52huq(P1_H, A1, R0, Assembler::AVX_512bit); + __ evpmadd52luq(P2_L, A1, R1, Assembler::AVX_512bit); + __ evpmadd52huq(P2_H, A1, R1, Assembler::AVX_512bit); // Carry propagation: // (Not quite aligned) | More mathematically correct: @@ -172,32 +193,32 @@ void MacroAssembler::poly1305_multiply8_avx512( // --------------------------- | ----------------------------------------------- // = P2_H A2 A1 A0 | = P2_H×2^130 + A2×2^88 + A1×2^44 + A0×2^0 // - vpsrlq(TMP1, P0_L, 44, Assembler::AVX_512bit); - evpandq(A0, P0_L, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits + __ vpsrlq(TMP1, P0_L, 44, Assembler::AVX_512bit); + __ evpandq(A0, P0_L, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits - vpsllq(P0_H, P0_H, 8, Assembler::AVX_512bit); - vpaddq(P0_H, P0_H, TMP1, Assembler::AVX_512bit); - vpaddq(P1_L, P1_L, P0_H, Assembler::AVX_512bit); - evpandq(A1, P1_L, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits + __ vpsllq(P0_H, P0_H, 8, Assembler::AVX_512bit); + __ vpaddq(P0_H, P0_H, TMP1, Assembler::AVX_512bit); + __ vpaddq(P1_L, P1_L, P0_H, Assembler::AVX_512bit); + __ evpandq(A1, P1_L, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits - vpsrlq(TMP1, P1_L, 44, Assembler::AVX_512bit); - vpsllq(P1_H, P1_H, 8, Assembler::AVX_512bit); - vpaddq(P1_H, P1_H, TMP1, Assembler::AVX_512bit); - vpaddq(P2_L, P2_L, P1_H, Assembler::AVX_512bit); - evpandq(A2, P2_L, Address(polyCP, mask_42), Assembler::AVX_512bit); // Clear top 22 bits + __ vpsrlq(TMP1, P1_L, 44, Assembler::AVX_512bit); + __ vpsllq(P1_H, P1_H, 8, Assembler::AVX_512bit); + __ vpaddq(P1_H, P1_H, TMP1, Assembler::AVX_512bit); + __ vpaddq(P2_L, P2_L, P1_H, Assembler::AVX_512bit); + __ evpandq(A2, P2_L, Address(polyCP, mask_42), Assembler::AVX_512bit); // Clear top 22 bits - vpsrlq(TMP1, P2_L, 42, Assembler::AVX_512bit); - vpsllq(P2_H, P2_H, 10, Assembler::AVX_512bit); - vpaddq(P2_H, P2_H, TMP1, Assembler::AVX_512bit); + __ vpsrlq(TMP1, P2_L, 42, Assembler::AVX_512bit); + __ vpsllq(P2_H, P2_H, 10, Assembler::AVX_512bit); + __ vpaddq(P2_H, P2_H, TMP1, Assembler::AVX_512bit); // Reduction: p2->a0->a1 // Multiply by 5 the highest bits (p2 is above 130 bits) - vpaddq(A0, A0, P2_H, Assembler::AVX_512bit); - vpsllq(P2_H, P2_H, 2, Assembler::AVX_512bit); - vpaddq(A0, A0, P2_H, Assembler::AVX_512bit); - vpsrlq(TMP1, A0, 44, Assembler::AVX_512bit); - evpandq(A0, A0, Address(polyCP, mask_44), Assembler::AVX_512bit); - vpaddq(A1, A1, TMP1, Assembler::AVX_512bit); + __ vpaddq(A0, A0, P2_H, Assembler::AVX_512bit); + __ vpsllq(P2_H, P2_H, 2, Assembler::AVX_512bit); + __ vpaddq(A0, A0, P2_H, Assembler::AVX_512bit); + __ vpsrlq(TMP1, A0, 44, Assembler::AVX_512bit); + __ evpandq(A0, A0, Address(polyCP, 
mask_44), Assembler::AVX_512bit); + __ vpaddq(A1, A1, TMP1, Assembler::AVX_512bit); } // Compute product for a single 16-byte message blocks @@ -226,7 +247,7 @@ void MacroAssembler::poly1305_multiply8_avx512( // a0 = L0L // a1 = L0H + L1L // t3 = L1H + L2L -void MacroAssembler::poly1305_multiply_scalar( +void StubGenerator::poly1305_multiply_scalar( const Register a0, const Register a1, const Register a2, const Register r0, const Register r1, const Register c1, bool only128) { @@ -236,28 +257,28 @@ void MacroAssembler::poly1305_multiply_scalar( // Note mulq instruction requires/clobers rax, rdx // t3:t2 = (a0 * r1) - movq(rax, r1); - mulq(a0); - movq(t2, rax); - movq(t3, rdx); + __ movq(rax, r1); + __ mulq(a0); + __ movq(t2, rax); + __ movq(t3, rdx); // t1:a0 = (a0 * r0) - movq(rax, r0); - mulq(a0); - movq(a0, rax); // a0 not used in other operations - movq(t1, rdx); + __ movq(rax, r0); + __ mulq(a0); + __ movq(a0, rax); // a0 not used in other operations + __ movq(t1, rdx); // t3:t2 += (a1 * r0) - movq(rax, r0); - mulq(a1); - addq(t2, rax); - adcq(t3, rdx); + __ movq(rax, r0); + __ mulq(a1); + __ addq(t2, rax); + __ adcq(t3, rdx); // t1:a0 += (a1 * r1x5) - movq(rax, c1); - mulq(a1); - addq(a0, rax); - adcq(t1, rdx); + __ movq(rax, c1); + __ mulq(a1); + __ addq(a0, rax); + __ adcq(t1, rdx); // Note: a2 is clamped to 2-bits, // r1/r0 is clamped to 60-bits, @@ -265,22 +286,22 @@ void MacroAssembler::poly1305_multiply_scalar( if (only128) { // Accumulator only 128 bits, i.e. a2 == 0 // just move and add t1-t2 to a1 - movq(a1, t1); - addq(a1, t2); - adcq(t3, 0); + __ movq(a1, t1); + __ addq(a1, t2); + __ adcq(t3, 0); } else { // t3:t2 += (a2 * r1x5) - movq(a1, a2); // use a1 for a2 - imulq(a1, c1); - addq(t2, a1); - adcq(t3, 0); + __ movq(a1, a2); // use a1 for a2 + __ imulq(a1, c1); + __ addq(t2, a1); + __ adcq(t3, 0); - movq(a1, t1); // t1:a0 => a1:a0 + __ movq(a1, t1); // t1:a0 => a1:a0 // t3:a1 += (a2 * r0):t2 - imulq(a2, r0); - addq(a1, t2); - adcq(t3, a2); + __ imulq(a2, r0); + __ addq(a1, t2); + __ adcq(t3, a2); } // At this point, 3 64-bit limbs are in t3:a1:a0 @@ -293,17 +314,17 @@ void MacroAssembler::poly1305_multiply_scalar( // a2:a1:a0 += k // // Result will be in a2:a1:a0 - movq(t1, t3); - movl(a2, t3); // DWORD - andq(t1, ~3); - shrq(t3, 2); - addq(t1, t3); - andl(a2, 3); // DWORD + __ movq(t1, t3); + __ movl(a2, t3); // DWORD + __ andq(t1, ~3); + __ shrq(t3, 2); + __ addq(t1, t3); + __ andl(a2, 3); // DWORD // a2:a1:a0 += k (kept in t1) - addq(a0, t1); - adcq(a1, 0); - adcl(a2, 0); // DWORD + __ addq(a0, t1); + __ adcq(a1, 0); + __ adcl(a2, 0); // DWORD } // Convert array of 128-bit numbers in quadwords (in D0:D1) into 128-bit numbers across 44-bit limbs (in L0:L1:L2) @@ -322,29 +343,29 @@ void MacroAssembler::poly1305_multiply_scalar( // L0 | h0 d0 g0 c0 f0 b0 e0 a0 | // +-------------------------+ // -void MacroAssembler::poly1305_limbs_avx512( +void StubGenerator::poly1305_limbs_avx512( const XMMRegister D0, const XMMRegister D1, const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, const Register polyCP) { const XMMRegister TMP1 = xmm0; const XMMRegister TMP2 = xmm1; // Interleave blocks of data - evpunpckhqdq(TMP1, D0, D1, Assembler::AVX_512bit); - evpunpcklqdq(L0, D0, D1, Assembler::AVX_512bit); + __ evpunpckhqdq(TMP1, D0, D1, Assembler::AVX_512bit); + __ evpunpcklqdq(L0, D0, D1, Assembler::AVX_512bit); // Highest 42-bit limbs of new blocks - vpsrlq(L2, TMP1, 24, Assembler::AVX_512bit); + __ vpsrlq(L2, TMP1, 24, Assembler::AVX_512bit); if (padMSG) 
{ - evporq(L2, L2, Address(polyCP, high_bit), Assembler::AVX_512bit); // Add 2^128 to all 8 final qwords of the message + __ evporq(L2, L2, Address(polyCP, high_bit), Assembler::AVX_512bit); // Add 2^128 to all 8 final qwords of the message } // Middle 44-bit limbs of new blocks - vpsrlq(L1, L0, 44, Assembler::AVX_512bit); - vpsllq(TMP2, TMP1, 20, Assembler::AVX_512bit); - vpternlogq(L1, 0xA8, TMP2, Address(polyCP, mask_44), Assembler::AVX_512bit); // (A OR B AND C) + __ vpsrlq(L1, L0, 44, Assembler::AVX_512bit); + __ vpsllq(TMP2, TMP1, 20, Assembler::AVX_512bit); + __ vpternlogq(L1, 0xA8, TMP2, Address(polyCP, mask_44), Assembler::AVX_512bit); // (A OR B AND C) // Lowest 44-bit limbs of new blocks - evpandq(L0, L0, Address(polyCP, mask_44), Assembler::AVX_512bit); + __ evpandq(L0, L0, Address(polyCP, mask_44), Assembler::AVX_512bit); } /** @@ -352,100 +373,99 @@ void MacroAssembler::poly1305_limbs_avx512( * * a2 is optional. When only128 is set, limbs are expected to fit into 128-bits (i.e. a1:a0 such as clamped R) */ -void MacroAssembler::poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, bool only128) +void StubGenerator::poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, bool only128) { const Register t1 = r13; const Register t2 = r14; - movq(a0, Address(limbs, 0)); - movq(t1, Address(limbs, 8)); - shlq(t1, 26); - addq(a0, t1); - movq(t1, Address(limbs, 16)); - movq(t2, Address(limbs, 24)); - movq(a1, t1); - shlq(t1, 52); - shrq(a1, 12); - shlq(t2, 14); - addq(a0, t1); - adcq(a1, t2); - movq(t1, Address(limbs, 32)); + __ movq(a0, Address(limbs, 0)); + __ movq(t1, Address(limbs, 8)); + __ shlq(t1, 26); + __ addq(a0, t1); + __ movq(t1, Address(limbs, 16)); + __ movq(t2, Address(limbs, 24)); + __ movq(a1, t1); + __ shlq(t1, 52); + __ shrq(a1, 12); + __ shlq(t2, 14); + __ addq(a0, t1); + __ adcq(a1, t2); + __ movq(t1, Address(limbs, 32)); if (!only128) { - movq(a2, t1); - shrq(a2, 24); + __ movq(a2, t1); + __ shrq(a2, 24); } - shlq(t1, 40); - addq(a1, t1); + __ shlq(t1, 40); + __ addq(a1, t1); if (only128) { return; } - adcq(a2, 0); + __ adcq(a2, 0); // One round of reduction // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0 - movq(t1, a2); - andq(t1, ~3); - andq(a2, 3); - movq(t2, t1); - shrq(t2, 2); - addq(t1, t2); - - addq(a0, t1); - adcq(a1, 0); - adcq(a2, 0); + __ movq(t1, a2); + __ andq(t1, ~3); + __ andq(a2, 3); + __ movq(t2, t1); + __ shrq(t2, 2); + __ addq(t1, t2); + + __ addq(a0, t1); + __ adcq(a1, 0); + __ adcq(a2, 0); } - /** * Break 3×64-bit a2:a1:a0 limbs into 5×26-bit limbs and store out into 5 quadwords at address `limbs` */ -void MacroAssembler::poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs) +void StubGenerator::poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs) { const Register t1 = r13; const Register t2 = r14; // Extra round of reduction // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0 - movq(t1, a2); - andq(t1, ~3); - andq(a2, 3); - movq(t2, t1); - shrq(t2, 2); - addq(t1, t2); + __ movq(t1, a2); + __ andq(t1, ~3); + __ andq(a2, 3); + __ movq(t2, t1); + __ shrq(t2, 2); + __ addq(t1, t2); - addq(a0, t1); - adcq(a1, 0); - adcq(a2, 0); + __ addq(a0, t1); + __ adcq(a1, 0); + __ adcq(a2, 0); // Chop a2:a1:a0 into 26-bit limbs - movl(t1, a0); - andl(t1, 0x3ffffff); - movq(Address(limbs, 0), t1); - - shrq(a0, 26); - movl(t1, a0); - andl(t1, 0x3ffffff); - 
movq(Address(limbs, 8), t1); - - shrq(a0, 26); // 12 bits left in a0, concatenate 14 from a1 - movl(t1, a1); - shll(t1, 12); - addl(t1, a0); - andl(t1, 0x3ffffff); - movq(Address(limbs, 16), t1); - - shrq(a1, 14); // already used up 14 bits - shlq(a2, 50); // a2 contains 2 bits when reduced, but $Element.limbs dont have to be fully reduced - addq(a1, a2); // put remaining bits into a1 - - movl(t1, a1); - andl(t1, 0x3ffffff); - movq(Address(limbs, 24), t1); - - shrq(a1, 26); - movl(t1, a1); + __ movl(t1, a0); + __ andl(t1, 0x3ffffff); + __ movq(Address(limbs, 0), t1); + + __ shrq(a0, 26); + __ movl(t1, a0); + __ andl(t1, 0x3ffffff); + __ movq(Address(limbs, 8), t1); + + __ shrq(a0, 26); // 12 bits left in a0, concatenate 14 from a1 + __ movl(t1, a1); + __ shll(t1, 12); + __ addl(t1, a0); + __ andl(t1, 0x3ffffff); + __ movq(Address(limbs, 16), t1); + + __ shrq(a1, 14); // already used up 14 bits + __ shlq(a2, 50); // a2 contains 2 bits when reduced, but $Element.limbs don't have to be fully reduced + __ addq(a1, a2); // put remaining bits into a1 + + __ movl(t1, a1); + __ andl(t1, 0x3ffffff); + __ movq(Address(limbs, 24), t1); + + __ shrq(a1, 26); + __ movl(t1, a1); //andl(t1, 0x3ffffff); doesn't have to be fully reduced, leave remaining bit(s) - movq(Address(limbs, 32), t1); + __ movq(Address(limbs, 32), t1); } // This function consumes as many whole 16*16-byte blocks as available in input // After execution, input and length will point at remaining (unprocessed) data // and accumulator will point to the current accumulator value @@ -527,7 +547,7 @@ void MacroAssembler::poly1305_limbs_out(const Register a0, const Register a1, co // T = A >> 1 // 2 ->1 blocks // A = A + T // a = A -void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const Register length, +void StubGenerator::poly1305_process_blocks_avx512(const Register input, const Register length, const Register a0, const Register a1, const Register a2, const Register r0, const Register r1, const Register c1) { @@ -570,22 +590,22 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const const XMMRegister R1P = xmm28; const XMMRegister R2P = xmm29; - subq(rsp, 512/8*6); // Make room to store 6 zmm registers (powers of R) - lea(polyCP, ExternalAddress(StubRoutines::x86::poly1305_mask_addr())); + __ subq(rsp, 512/8*6); // Make room to store 6 zmm registers (powers of R) + __ lea(polyCP, ExternalAddress(poly1305_mask_addr())); // Spread accumulator into 44-bit limbs in quadwords C0,C1,C2 - movq(t0, a0); - andq(t0, Address(polyCP, mask_44)); // First limb (Acc[43:0]) - movq(C0, t0); + __ movq(t0, a0); + __ andq(t0, Address(polyCP, mask_44)); // First limb (Acc[43:0]) + __ movq(C0, t0); - movq(t0, a1); - shrdq(a0, t0, 44); - andq(a0, Address(polyCP, mask_44)); // Second limb (Acc[77:52]) - movq(C1, a0); + __ movq(t0, a1); + __ shrdq(a0, t0, 44); + __ andq(a0, Address(polyCP, mask_44)); // Second limb (Acc[87:44]) + __ movq(C1, a0); - shrdq(a1, a2, 24); - andq(a1, Address(polyCP, mask_42)); // Third limb (Acc[129:88]) - movq(C2, a1); + __ shrdq(a1, a2, 24); + __ andq(a1, Address(polyCP, mask_42)); // Third limb (Acc[129:88]) + __ movq(C2, a1); // To add accumulator, we must unroll first loop iteration // Load first block of data (128 bytes) and pad // A0 to have bits 0-43 of all 8 blocks in 8 qwords // A1 to have bits 87-44 of all 8 blocks in 8 qwords // A2 to have bits 127-88 of all 8 blocks in 8 qwords - evmovdquq(T0, Address(input, 0), Assembler::AVX_512bit); - evmovdquq(T1, Address(input, 64), Assembler::AVX_512bit); + __ evmovdquq(T0, Address(input, 0), Assembler::AVX_512bit); + __ evmovdquq(T1, Address(input, 64), Assembler::AVX_512bit);
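// ---------------------------------------------------------------------------
// [Editorial sketch; not part of the patch] A scalar model of the limb spread
// just performed: the 130-bit accumulator, held as 64+64+2 bits in a2:a1:a0,
// is re-split into 44/44/42-bit limbs. All names here are hypothetical; the
// function is self-contained given <stdint.h> if lifted out of the document.
static inline void poly1305_spread_limbs_sketch(uint64_t a0, uint64_t a1, uint64_t a2,
                                                uint64_t* c0, uint64_t* c1, uint64_t* c2) {
  const uint64_t MASK44 = 0xfffffffffffULL;   // low 44 bits set
  const uint64_t MASK42 = 0x3ffffffffffULL;   // low 42 bits set
  *c0 = a0 & MASK44;                          // Acc[43:0]
  *c1 = ((a0 >> 44) | (a1 << 20)) & MASK44;   // Acc[87:44], cf. shrdq(a0, a1, 44)
  *c2 = ((a1 >> 24) | (a2 << 40)) & MASK42;   // Acc[129:88], cf. shrdq(a1, a2, 24)
}
// ---------------------------------------------------------------------------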
poly1305_limbs_avx512(T0, T1, A0, A1, A2, true, polyCP); // Add accumulator to the first message block - vpaddq(A0, A0, C0, Assembler::AVX_512bit); - vpaddq(A1, A1, C1, Assembler::AVX_512bit); - vpaddq(A2, A2, C2, Assembler::AVX_512bit); + __ vpaddq(A0, A0, C0, Assembler::AVX_512bit); + __ vpaddq(A1, A1, C1, Assembler::AVX_512bit); + __ vpaddq(A2, A2, C2, Assembler::AVX_512bit); // Load next blocks of data (128 bytes) and pad // A3 to have bits 0-43 of all 8 blocks in 8 qwords // A4 to have bits 87-44 of all 8 blocks in 8 qwords // A5 to have bits 127-88 of all 8 blocks in 8 qwords - evmovdquq(T0, Address(input, 64*2), Assembler::AVX_512bit); - evmovdquq(T1, Address(input, 64*3), Assembler::AVX_512bit); + __ evmovdquq(T0, Address(input, 64*2), Assembler::AVX_512bit); + __ evmovdquq(T1, Address(input, 64*3), Assembler::AVX_512bit); poly1305_limbs_avx512(T0, T1, A3, A4, A5, true, polyCP); - subl(length, 16*16); - lea(input, Address(input,16*16)); + __ subl(length, 16*16); + __ lea(input, Address(input,16*16)); // Compute the powers of R^1..R^4 and form 44-bit limbs of each // T0 to have bits 0-127 in 4 quadword pairs // T1 to have bits 128-129 in alternating 8 qwords - vpxorq(T1, T1, T1, Assembler::AVX_512bit); - movq(T2, r0); - vpinsrq(T2, T2, r1, 1); - vinserti32x4(T0, T0, T2, 3); + __ vpxorq(T1, T1, T1, Assembler::AVX_512bit); + __ movq(T2, r0); + __ vpinsrq(T2, T2, r1, 1); + __ vinserti32x4(T0, T0, T2, 3); // Calculate R^2 - movq(a0, r0); - movq(a1, r1); + __ movq(a0, r0); + __ movq(a1, r1); // "Clever": a2 not set because poly1305_multiply_scalar has a flag to indicate 128-bit accumulator poly1305_multiply_scalar(a0, a1, a2, r0, r1, c1, true); - movq(T2, a0); - vpinsrq(T2, T2, a1, 1); - vinserti32x4(T0, T0, T2, 2); - movq(T2, a2); - vinserti32x4(T1, T1, T2, 2); + __ movq(T2, a0); + __ vpinsrq(T2, T2, a1, 1); + __ vinserti32x4(T0, T0, T2, 2); + __ movq(T2, a2); + __ vinserti32x4(T1, T1, T2, 2); // Calculate R^3 poly1305_multiply_scalar(a0, a1, a2, r0, r1, c1, false); - movq(T2, a0); - vpinsrq(T2, T2, a1, 1); - vinserti32x4(T0, T0, T2, 1); - movq(T2, a2); - vinserti32x4(T1, T1, T2, 1); + __ movq(T2, a0); + __ vpinsrq(T2, T2, a1, 1); + __ vinserti32x4(T0, T0, T2, 1); + __ movq(T2, a2); + __ vinserti32x4(T1, T1, T2, 1); // Calculate R^4 poly1305_multiply_scalar(a0, a1, a2, r0, r1, c1, false); - movq(T2, a0); - vpinsrq(T2, T2, a1, 1); - vinserti32x4(T0, T0, T2, 0); - movq(T2, a2); - vinserti32x4(T1, T1, T2, 0); + __ movq(T2, a0); + __ vpinsrq(T2, T2, a1, 1); + __ vinserti32x4(T0, T0, T2, 0); + __ movq(T2, a2); + __ vinserti32x4(T1, T1, T2, 0); // Interleave the powers of R^1..R^4 to form 44-bit limbs (half-empty) // B0 to have bits 0-43 of all 4 blocks in alternating 8 qwords // B1 to have bits 87-44 of all 4 blocks in alternating 8 qwords // B2 to have bits 127-88 of all 4 blocks in alternating 8 qwords - lea(polyCP, ExternalAddress(StubRoutines::x86::poly1305_mask_addr())); - vpxorq(T2, T2, T2, Assembler::AVX_512bit); + __ lea(polyCP, ExternalAddress(poly1305_mask_addr())); + __ vpxorq(T2, T2, T2, Assembler::AVX_512bit); poly1305_limbs_avx512(T0, T2, B0, B1, B2, false, polyCP); // T1 contains the 2 highest bits of the powers of R - vpsllq(T1, T1, 40, Assembler::AVX_512bit); - evporq(B2, B2, T1, Assembler::AVX_512bit); + __ vpsllq(T1, T1, 40, Assembler::AVX_512bit); + __ evporq(B2, B2, T1, Assembler::AVX_512bit); // Broadcast 44-bit limbs of R^4 into R0,R1,R2 - mov(t0, a0); - andq(t0, Address(polyCP, mask_44)); // First limb
(R^4[43:0]) - evpbroadcastq(R0, t0, Assembler::AVX_512bit); + __ mov(t0, a0); + __ andq(t0, Address(polyCP, mask_44)); // First limb (R^4[43:0]) + __ evpbroadcastq(R0, t0, Assembler::AVX_512bit); - movq(t0, a1); - shrdq(a0, t0, 44); - andq(a0, Address(polyCP, mask_44)); // Second limb (R^4[87:44]) - evpbroadcastq(R1, a0, Assembler::AVX_512bit); + __ movq(t0, a1); + __ shrdq(a0, t0, 44); + __ andq(a0, Address(polyCP, mask_44)); // Second limb (R^4[87:44]) + __ evpbroadcastq(R1, a0, Assembler::AVX_512bit); - shrdq(a1, a2, 24); - andq(a1, Address(polyCP, mask_42)); // Third limb (R^4[129:88]) - evpbroadcastq(R2, a1, Assembler::AVX_512bit); + __ shrdq(a1, a2, 24); + __ andq(a1, Address(polyCP, mask_42)); // Third limb (R^4[129:88]) + __ evpbroadcastq(R2, a1, Assembler::AVX_512bit); // Generate 4*5*R^4 into {R2P,R1P} // Used as multiplier in poly1305_multiply8_avx512 so can // ignore bottom limb and carry propagation - vpsllq(R1P, R1, 2, Assembler::AVX_512bit); // 4*R^4 - vpsllq(R2P, R2, 2, Assembler::AVX_512bit); - vpaddq(R1P, R1P, R1, Assembler::AVX_512bit); // 5*R^4 - vpaddq(R2P, R2P, R2, Assembler::AVX_512bit); - vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R^4 - vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); + __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit); // 4*R^4 + __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit); + __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit); // 5*R^4 + __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit); + __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R^4 + __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); // Move R^4..R^1 one element over - vpslldq(C0, B0, 8, Assembler::AVX_512bit); - vpslldq(C1, B1, 8, Assembler::AVX_512bit); - vpslldq(C2, B2, 8, Assembler::AVX_512bit); + __ vpslldq(C0, B0, 8, Assembler::AVX_512bit); + __ vpslldq(C1, B1, 8, Assembler::AVX_512bit); + __ vpslldq(C2, B2, 8, Assembler::AVX_512bit); // Calculate R^8-R^5 poly1305_multiply8_avx512(B0, B1, B2, // ACC=R^4..R^1 @@ -698,27 +718,27 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const polyCP); // Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R - evporq(B0, B0, C0, Assembler::AVX_512bit); - evporq(B1, B1, C1, Assembler::AVX_512bit); - evporq(B2, B2, C2, Assembler::AVX_512bit); + __ evporq(B0, B0, C0, Assembler::AVX_512bit); + __ evporq(B1, B1, C1, Assembler::AVX_512bit); + __ evporq(B2, B2, C2, Assembler::AVX_512bit); // Broadcast R^8 - vpbroadcastq(R0, B0, Assembler::AVX_512bit); - vpbroadcastq(R1, B1, Assembler::AVX_512bit); - vpbroadcastq(R2, B2, Assembler::AVX_512bit); + __ vpbroadcastq(R0, B0, Assembler::AVX_512bit); + __ vpbroadcastq(R1, B1, Assembler::AVX_512bit); + __ vpbroadcastq(R2, B2, Assembler::AVX_512bit); // Generate 4*5*R^8 - vpsllq(R1P, R1, 2, Assembler::AVX_512bit); - vpsllq(R2P, R2, 2, Assembler::AVX_512bit); - vpaddq(R1P, R1P, R1, Assembler::AVX_512bit); // 5*R^8 - vpaddq(R2P, R2P, R2, Assembler::AVX_512bit); - vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R^8 - vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); + __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit); + __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit); + __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit); // 5*R^8 + __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit); + __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R^8 + __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); // Store R^8-R for later use - evmovdquq(Address(rsp, 64*0), B0, Assembler::AVX_512bit); - evmovdquq(Address(rsp, 64*1), B1, Assembler::AVX_512bit); - evmovdquq(Address(rsp, 64*2), B2, Assembler::AVX_512bit); + __ 
evmovdquq(Address(rsp, 64*0), B0, Assembler::AVX_512bit); + __ evmovdquq(Address(rsp, 64*1), B1, Assembler::AVX_512bit); + __ evmovdquq(Address(rsp, 64*2), B2, Assembler::AVX_512bit); // Calculate R^16-R^9 poly1305_multiply8_avx512(B0, B1, B2, // ACC=R^8..R^1 @@ -726,36 +746,36 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const polyCP); // Store R^16-R^9 for later use - evmovdquq(Address(rsp, 64*3), B0, Assembler::AVX_512bit); - evmovdquq(Address(rsp, 64*4), B1, Assembler::AVX_512bit); - evmovdquq(Address(rsp, 64*5), B2, Assembler::AVX_512bit); + __ evmovdquq(Address(rsp, 64*3), B0, Assembler::AVX_512bit); + __ evmovdquq(Address(rsp, 64*4), B1, Assembler::AVX_512bit); + __ evmovdquq(Address(rsp, 64*5), B2, Assembler::AVX_512bit); // Broadcast R^16 - vpbroadcastq(R0, B0, Assembler::AVX_512bit); - vpbroadcastq(R1, B1, Assembler::AVX_512bit); - vpbroadcastq(R2, B2, Assembler::AVX_512bit); + __ vpbroadcastq(R0, B0, Assembler::AVX_512bit); + __ vpbroadcastq(R1, B1, Assembler::AVX_512bit); + __ vpbroadcastq(R2, B2, Assembler::AVX_512bit); // Generate 4*5*R^16 - vpsllq(R1P, R1, 2, Assembler::AVX_512bit); - vpsllq(R2P, R2, 2, Assembler::AVX_512bit); - vpaddq(R1P, R1P, R1, Assembler::AVX_512bit); // 5*R^16 - vpaddq(R2P, R2P, R2, Assembler::AVX_512bit); - vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R^16 - vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); + __ vpsllq(R1P, R1, 2, Assembler::AVX_512bit); + __ vpsllq(R2P, R2, 2, Assembler::AVX_512bit); + __ vpaddq(R1P, R1P, R1, Assembler::AVX_512bit); // 5*R^16 + __ vpaddq(R2P, R2P, R2, Assembler::AVX_512bit); + __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R^16 + __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); // VECTOR LOOP: process 16 * 16-byte message block at a time - bind(L_process256Loop); - cmpl(length, 16*16); - jcc(Assembler::less, L_process256LoopDone); + __ bind(L_process256Loop); + __ cmpl(length, 16*16); + __ jcc(Assembler::less, L_process256LoopDone); // Load and interleave next block of data (128 bytes) - evmovdquq(T0, Address(input, 0), Assembler::AVX_512bit); - evmovdquq(T1, Address(input, 64), Assembler::AVX_512bit); + __ evmovdquq(T0, Address(input, 0), Assembler::AVX_512bit); + __ evmovdquq(T1, Address(input, 64), Assembler::AVX_512bit); poly1305_limbs_avx512(T0, T1, B0, B1, B2, true, polyCP); // Load and interleave next block of data (128 bytes) - evmovdquq(T0, Address(input, 64*2), Assembler::AVX_512bit); - evmovdquq(T1, Address(input, 64*3), Assembler::AVX_512bit); + __ evmovdquq(T0, Address(input, 64*2), Assembler::AVX_512bit); + __ evmovdquq(T1, Address(input, 64*3), Assembler::AVX_512bit); poly1305_limbs_avx512(T0, T1, B3, B4, B5, true, polyCP); poly1305_multiply8_avx512(A0, A1, A2, // MSG/ACC 16 blocks @@ -765,44 +785,44 @@ void MacroAssembler::poly1305_process_blocks_avx512(const Register input, const R0, R1, R2, R1P, R2P, //R^16..R^16, 4*5*R^16 polyCP); - vpaddq(A0, A0, B0, Assembler::AVX_512bit); // Add low 42-bit bits from new blocks to accumulator - vpaddq(A1, A1, B1, Assembler::AVX_512bit); // Add medium 42-bit bits from new blocks to accumulator - vpaddq(A2, A2, B2, Assembler::AVX_512bit); //Add highest bits from new blocks to accumulator - vpaddq(A3, A3, B3, Assembler::AVX_512bit); // Add low 42-bit bits from new blocks to accumulator - vpaddq(A4, A4, B4, Assembler::AVX_512bit); // Add medium 42-bit bits from new blocks to accumulator - vpaddq(A5, A5, B5, Assembler::AVX_512bit); // Add highest bits from new blocks to accumulator + __ vpaddq(A0, A0, B0, Assembler::AVX_512bit); // 
Add lowest 44-bit limbs from new blocks to accumulator + __ vpaddq(A1, A1, B1, Assembler::AVX_512bit); // Add middle 44-bit limbs from new blocks to accumulator + __ vpaddq(A2, A2, B2, Assembler::AVX_512bit); // Add highest 42-bit limbs from new blocks to accumulator + __ vpaddq(A3, A3, B3, Assembler::AVX_512bit); // Add lowest 44-bit limbs from new blocks to accumulator + __ vpaddq(A4, A4, B4, Assembler::AVX_512bit); // Add middle 44-bit limbs from new blocks to accumulator + __ vpaddq(A5, A5, B5, Assembler::AVX_512bit); // Add highest 42-bit limbs from new blocks to accumulator - subl(length, 16*16); - lea(input, Address(input,16*16)); - jmp(L_process256Loop); + __ subl(length, 16*16); + __ lea(input, Address(input,16*16)); + __ jmp(L_process256Loop); - bind(L_process256LoopDone); + __ bind(L_process256LoopDone); // Tail processing: Need to multiply ACC by R^16..R^1 and add it all up into a single scalar value // Read R^16-R^9 - evmovdquq(B0, Address(rsp, 64*3), Assembler::AVX_512bit); - evmovdquq(B1, Address(rsp, 64*4), Assembler::AVX_512bit); - evmovdquq(B2, Address(rsp, 64*5), Assembler::AVX_512bit); + __ evmovdquq(B0, Address(rsp, 64*3), Assembler::AVX_512bit); + __ evmovdquq(B1, Address(rsp, 64*4), Assembler::AVX_512bit); + __ evmovdquq(B2, Address(rsp, 64*5), Assembler::AVX_512bit); // Read R^8-R - evmovdquq(R0, Address(rsp, 64*0), Assembler::AVX_512bit); - evmovdquq(R1, Address(rsp, 64*1), Assembler::AVX_512bit); - evmovdquq(R2, Address(rsp, 64*2), Assembler::AVX_512bit); + __ evmovdquq(R0, Address(rsp, 64*0), Assembler::AVX_512bit); + __ evmovdquq(R1, Address(rsp, 64*1), Assembler::AVX_512bit); + __ evmovdquq(R2, Address(rsp, 64*2), Assembler::AVX_512bit); // Generate 4*5*[R^16..R^9] (ignore lowest limb) - vpsllq(T0, B1, 2, Assembler::AVX_512bit); - vpaddq(B3, B1, T0, Assembler::AVX_512bit); // R1' (R1*5) - vpsllq(T0, B2, 2, Assembler::AVX_512bit); - vpaddq(B4, B2, T0, Assembler::AVX_512bit); // R2' (R2*5) - vpsllq(B3, B3, 2, Assembler::AVX_512bit); // 4*5*R - vpsllq(B4, B4, 2, Assembler::AVX_512bit); + __ vpsllq(T0, B1, 2, Assembler::AVX_512bit); + __ vpaddq(B3, B1, T0, Assembler::AVX_512bit); // R1' (R1*5) + __ vpsllq(T0, B2, 2, Assembler::AVX_512bit); + __ vpaddq(B4, B2, T0, Assembler::AVX_512bit); // R2' (R2*5) + __ vpsllq(B3, B3, 2, Assembler::AVX_512bit); // 4*5*R + __ vpsllq(B4, B4, 2, Assembler::AVX_512bit); // Generate 4*5*[R^8..R^1] (ignore lowest limb) - vpsllq(T0, R1, 2, Assembler::AVX_512bit); - vpaddq(R1P, R1, T0, Assembler::AVX_512bit); // R1' (R1*5) - vpsllq(T0, R2, 2, Assembler::AVX_512bit); - vpaddq(R2P, R2, T0, Assembler::AVX_512bit); // R2' (R2*5) - vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R - vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); + __ vpsllq(T0, R1, 2, Assembler::AVX_512bit); + __ vpaddq(R1P, R1, T0, Assembler::AVX_512bit); // R1' (R1*5) + __ vpsllq(T0, R2, 2, Assembler::AVX_512bit); + __ vpaddq(R2P, R2, T0, Assembler::AVX_512bit); // R2' (R2*5) + __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R + __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); poly1305_multiply8_avx512(A0, A1, A2, // MSG/ACC 16 blocks B0, B1, B2, B3, B4, // R^16-R^9, R1P, R2P polyCP); poly1305_multiply8_avx512(A3, A4, A5, // MSG/ACC 16 blocks R0, R1, R2, R1P, R2P, // R^8-R, R1P, R2P polyCP); // Add all blocks (horizontally) // 16->8 blocks - vpaddq(A0, A0, A3, Assembler::AVX_512bit); - vpaddq(A1, A1, A4, Assembler::AVX_512bit); - vpaddq(A2, A2, A5, Assembler::AVX_512bit); + __ vpaddq(A0, A0, A3, Assembler::AVX_512bit); + __ vpaddq(A1, A1, A4, Assembler::AVX_512bit); + __ vpaddq(A2, A2, A5, Assembler::AVX_512bit);
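// ---------------------------------------------------------------------------
// [Editorial sketch; not part of the patch] The horizontal fold here halves
// the number of live per-block partial accumulators each step
// (16->8->4->2->1). A scalar model of the same idea, applied per 44/44/42-bit
// limb vector in the real code; `lanes` and the helper name are hypothetical:
static inline uint64_t poly1305_fold_lanes_sketch(uint64_t lanes[16]) {
  for (int width = 8; width >= 1; width /= 2) {  // halve the live lane count
    for (int i = 0; i < width; i++) {
      lanes[i] += lanes[i + width];              // pairwise add, cf. vpaddq/vextracti*
    }
  }
  return lanes[0];                               // single folded sum (before carry fix-up)
}
// In the stub, one carry-propagation pass afterwards restores the limb bounds.
// ---------------------------------------------------------------------------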
// 8 -> 4 blocks - vextracti64x4(T0, A0, 1); - vextracti64x4(T1, A1, 1); - vextracti64x4(T2, A2, 1); - vpaddq(A0, A0, T0, Assembler::AVX_256bit); - vpaddq(A1, A1, T1, Assembler::AVX_256bit); - vpaddq(A2, A2, T2, Assembler::AVX_256bit); + __ vextracti64x4(T0, A0, 1); + __ vextracti64x4(T1, A1, 1); + __ vextracti64x4(T2, A2, 1); + __ vpaddq(A0, A0, T0, Assembler::AVX_256bit); + __ vpaddq(A1, A1, T1, Assembler::AVX_256bit); + __ vpaddq(A2, A2, T2, Assembler::AVX_256bit); // 4 -> 2 blocks - vextracti32x4(T0, A0, 1); - vextracti32x4(T1, A1, 1); - vextracti32x4(T2, A2, 1); - vpaddq(A0, A0, T0, Assembler::AVX_128bit); - vpaddq(A1, A1, T1, Assembler::AVX_128bit); - vpaddq(A2, A2, T2, Assembler::AVX_128bit); + __ vextracti32x4(T0, A0, 1); + __ vextracti32x4(T1, A1, 1); + __ vextracti32x4(T2, A2, 1); + __ vpaddq(A0, A0, T0, Assembler::AVX_128bit); + __ vpaddq(A1, A1, T1, Assembler::AVX_128bit); + __ vpaddq(A2, A2, T2, Assembler::AVX_128bit); // 2 -> 1 blocks - vpsrldq(T0, A0, 8, Assembler::AVX_128bit); - vpsrldq(T1, A1, 8, Assembler::AVX_128bit); - vpsrldq(T2, A2, 8, Assembler::AVX_128bit); + __ vpsrldq(T0, A0, 8, Assembler::AVX_128bit); + __ vpsrldq(T1, A1, 8, Assembler::AVX_128bit); + __ vpsrldq(T2, A2, 8, Assembler::AVX_128bit); // Finish folding and clear second qword - mov64(t0, 0xfd); - kmovql(k1, t0); - evpaddq(A0, k1, A0, T0, false, Assembler::AVX_512bit); - evpaddq(A1, k1, A1, T1, false, Assembler::AVX_512bit); - evpaddq(A2, k1, A2, T2, false, Assembler::AVX_512bit); + __ mov64(t0, 0xfd); + __ kmovql(k1, t0); + __ evpaddq(A0, k1, A0, T0, false, Assembler::AVX_512bit); + __ evpaddq(A1, k1, A1, T1, false, Assembler::AVX_512bit); + __ evpaddq(A2, k1, A2, T2, false, Assembler::AVX_512bit); // Carry propagation - vpsrlq(T0, A0, 44, Assembler::AVX_512bit); - evpandq(A0, A0, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits - vpaddq(A1, A1, T0, Assembler::AVX_512bit); - vpsrlq(T0, A1, 44, Assembler::AVX_512bit); - evpandq(A1, A1, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits - vpaddq(A2, A2, T0, Assembler::AVX_512bit); - vpsrlq(T0, A2, 42, Assembler::AVX_512bit); - evpandq(A2, A2, Address(polyCP, mask_42), Assembler::AVX_512bit); // Clear top 22 bits - vpsllq(T1, T0, 2, Assembler::AVX_512bit); - vpaddq(T0, T0, T1, Assembler::AVX_512bit); - vpaddq(A0, A0, T0, Assembler::AVX_512bit); + __ vpsrlq(T0, A0, 44, Assembler::AVX_512bit); + __ evpandq(A0, A0, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits + __ vpaddq(A1, A1, T0, Assembler::AVX_512bit); + __ vpsrlq(T0, A1, 44, Assembler::AVX_512bit); + __ evpandq(A1, A1, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits + __ vpaddq(A2, A2, T0, Assembler::AVX_512bit); + __ vpsrlq(T0, A2, 42, Assembler::AVX_512bit); + __ evpandq(A2, A2, Address(polyCP, mask_42), Assembler::AVX_512bit); // Clear top 22 bits + __ vpsllq(T1, T0, 2, Assembler::AVX_512bit); + __ vpaddq(T0, T0, T1, Assembler::AVX_512bit); + __ vpaddq(A0, A0, T0, Assembler::AVX_512bit); // Put together A (accumulator) - movq(a0, A0); + __ movq(a0, A0); - movq(t0, A1); - movq(t1, t0); - shlq(t1, 44); - orq(a0, t1); + __ movq(t0, A1); + __ movq(t1, t0); + __ shlq(t1, 44); + __ orq(a0, t1); - shrq(t0, 20); - movq(a2, A2); - movq(a1, a2); - shlq(a1, 24); - orq(a1, t0); - shrq(a2, 40); + __ shrq(t0, 20); + __ movq(a2, A2); + __ movq(a1, a2); + __ shlq(a1, 24); + __ orq(a1, t0); + __ shrq(a2, 40); // Cleanup - vpxorq(xmm0, xmm0, xmm0, Assembler::AVX_512bit); - vpxorq(xmm1, xmm1, xmm1,
Assembler::AVX_512bit); - vpxorq(T0, T0, T0, Assembler::AVX_512bit); - vpxorq(T1, T1, T1, Assembler::AVX_512bit); - vpxorq(T2, T2, T2, Assembler::AVX_512bit); - vpxorq(C0, C0, C0, Assembler::AVX_512bit); - vpxorq(C1, C1, C1, Assembler::AVX_512bit); - vpxorq(C2, C2, C2, Assembler::AVX_512bit); - vpxorq(A0, A0, A0, Assembler::AVX_512bit); - vpxorq(A1, A1, A1, Assembler::AVX_512bit); - vpxorq(A2, A2, A2, Assembler::AVX_512bit); - vpxorq(A3, A3, A3, Assembler::AVX_512bit); - vpxorq(A4, A4, A4, Assembler::AVX_512bit); - vpxorq(A5, A5, A5, Assembler::AVX_512bit); - vpxorq(B0, B0, B0, Assembler::AVX_512bit); - vpxorq(B1, B1, B1, Assembler::AVX_512bit); - vpxorq(B2, B2, B2, Assembler::AVX_512bit); - vpxorq(B3, B3, B3, Assembler::AVX_512bit); - vpxorq(B4, B4, B4, Assembler::AVX_512bit); - vpxorq(B5, B5, B5, Assembler::AVX_512bit); - vpxorq(R0, R0, R0, Assembler::AVX_512bit); - vpxorq(R1, R1, R1, Assembler::AVX_512bit); - vpxorq(R2, R2, R2, Assembler::AVX_512bit); - vpxorq(R1P, R1P, R1P, Assembler::AVX_512bit); - vpxorq(R2P, R2P, R2P, Assembler::AVX_512bit); - evmovdquq(Address(rsp, 64*3), A0, Assembler::AVX_512bit); - evmovdquq(Address(rsp, 64*4), A0, Assembler::AVX_512bit); - evmovdquq(Address(rsp, 64*5), A0, Assembler::AVX_512bit); - evmovdquq(Address(rsp, 64*0), A0, Assembler::AVX_512bit); - evmovdquq(Address(rsp, 64*1), A0, Assembler::AVX_512bit); - evmovdquq(Address(rsp, 64*2), A0, Assembler::AVX_512bit); - addq(rsp, 512/8*6); // (powers of R) + __ vpxorq(xmm0, xmm0, xmm0, Assembler::AVX_512bit); + __ vpxorq(xmm1, xmm1, xmm1, Assembler::AVX_512bit); + __ vpxorq(T0, T0, T0, Assembler::AVX_512bit); + __ vpxorq(T1, T1, T1, Assembler::AVX_512bit); + __ vpxorq(T2, T2, T2, Assembler::AVX_512bit); + __ vpxorq(C0, C0, C0, Assembler::AVX_512bit); + __ vpxorq(C1, C1, C1, Assembler::AVX_512bit); + __ vpxorq(C2, C2, C2, Assembler::AVX_512bit); + __ vpxorq(A0, A0, A0, Assembler::AVX_512bit); + __ vpxorq(A1, A1, A1, Assembler::AVX_512bit); + __ vpxorq(A2, A2, A2, Assembler::AVX_512bit); + __ vpxorq(A3, A3, A3, Assembler::AVX_512bit); + __ vpxorq(A4, A4, A4, Assembler::AVX_512bit); + __ vpxorq(A5, A5, A5, Assembler::AVX_512bit); + __ vpxorq(B0, B0, B0, Assembler::AVX_512bit); + __ vpxorq(B1, B1, B1, Assembler::AVX_512bit); + __ vpxorq(B2, B2, B2, Assembler::AVX_512bit); + __ vpxorq(B3, B3, B3, Assembler::AVX_512bit); + __ vpxorq(B4, B4, B4, Assembler::AVX_512bit); + __ vpxorq(B5, B5, B5, Assembler::AVX_512bit); + __ vpxorq(R0, R0, R0, Assembler::AVX_512bit); + __ vpxorq(R1, R1, R1, Assembler::AVX_512bit); + __ vpxorq(R2, R2, R2, Assembler::AVX_512bit); + __ vpxorq(R1P, R1P, R1P, Assembler::AVX_512bit); + __ vpxorq(R2P, R2P, R2P, Assembler::AVX_512bit); + __ evmovdquq(Address(rsp, 64*3), A0, Assembler::AVX_512bit); + __ evmovdquq(Address(rsp, 64*4), A0, Assembler::AVX_512bit); + __ evmovdquq(Address(rsp, 64*5), A0, Assembler::AVX_512bit); + __ evmovdquq(Address(rsp, 64*0), A0, Assembler::AVX_512bit); + __ evmovdquq(Address(rsp, 64*1), A0, Assembler::AVX_512bit); + __ evmovdquq(Address(rsp, 64*2), A0, Assembler::AVX_512bit); + __ addq(rsp, 512/8*6); // (powers of R) } // This function consumes as many whole 16-byte blocks as available in input // After execution, input and length will point at remaining (unprocessed) data // and accumulator will point to the current accumulator value -// -void MacroAssembler::poly1305_process_blocks(Register input, Register length, Register accumulator, Register R) -{ - // Register Map: - // input = rdi; - // length = rbx; - // accumulator = rcx; - // R = r8; +address 
StubGenerator::generate_poly1305_processBlocks() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks"); + address start = __ pc(); + __ enter(); + + // Save all 'SOE' registers + __ push(rbx); + #ifdef _WIN64 + __ push(rsi); + __ push(rdi); + #endif + __ push(r12); + __ push(r13); + __ push(r14); + __ push(r15); + + // void processBlocks(byte[] input, int len, int[5] a, int[5] r) + const Register input = rdi; //input+offset + const Register length = rbx; + const Register accumulator = rcx; + const Register R = r8; + + #ifdef _WIN64 + __ mov(input, c_rarg0); + __ mov(length, c_rarg1); + __ mov(accumulator, c_rarg2); + __ mov(R, c_rarg3); + #else // input already in correct position for Linux; don't clobber R, args copied out-of-order + __ mov(length, c_rarg1); + __ mov(R, c_rarg3); + __ mov(accumulator, c_rarg2); + #endif const Register a0 = rsi; // [in/out] accumulator bits 63..0 const Register a1 = r9; // [in/out] accumulator bits 127..64 @@ -933,38 +978,50 @@ void MacroAssembler::poly1305_process_blocks(Register input, Register length, Re poly1305_limbs(R, r0, r1, r1, true); // Compute 5*R (Upper limb only) - movq(c1, r1); - shrq(c1, 2); - addq(c1, r1); // c1 = r1 + (r1 >> 2) + __ movq(c1, r1); + __ shrq(c1, 2); + __ addq(c1, r1); // c1 = r1 + (r1 >> 2) // Load accumulator into a2:a1:a0 poly1305_limbs(accumulator, a0, a1, a2, false); // VECTOR LOOP: Minimum of 256 bytes to run vectorized code - cmpl(length, 16*16); - jcc(Assembler::less, L_process16Loop); + __ cmpl(length, 16*16); + __ jcc(Assembler::less, L_process16Loop); poly1305_process_blocks_avx512(input, length, a0, a1, a2, r0, r1, c1); // SCALAR LOOP: process one 16-byte message block at a time - bind(L_process16Loop); - cmpl(length, 16); - jcc(Assembler::less, L_process16LoopDone); + __ bind(L_process16Loop); + __ cmpl(length, 16); + __ jcc(Assembler::less, L_process16LoopDone); - addq(a0, Address(input,0)); - adcq(a1, Address(input,8)); - adcq(a2,1); + __ addq(a0, Address(input,0)); + __ adcq(a1, Address(input,8)); + __ adcq(a2,1); poly1305_multiply_scalar(a0, a1, a2, r0, r1, c1, false); - subl(length, 16); - lea(input, Address(input,16)); - jmp(L_process16Loop); - bind(L_process16LoopDone); + __ subl(length, 16); + __ lea(input, Address(input,16)); + __ jmp(L_process16Loop); + __ bind(L_process16LoopDone); // Write output poly1305_limbs_out(a0, a1, a2, accumulator); -} -#endif // _LP64 + __ pop(r15); + __ pop(r14); + __ pop(r13); + __ pop(r12); + #ifdef _WIN64 + __ pop(rdi); + __ pop(rsi); + #endif + __ pop(rbx); + + __ leave(); + __ ret(0); + return start; +} \ No newline at end of file diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.cpp b/src/hotspot/cpu/x86/stubRoutines_x86.cpp index e72d86010585f..ca2f7bc3cc88a 100644 --- a/src/hotspot/cpu/x86/stubRoutines_x86.cpp +++ b/src/hotspot/cpu/x86/stubRoutines_x86.cpp @@ -80,7 +80,6 @@ address StubRoutines::x86::_join_0_1_base64 = NULL; address StubRoutines::x86::_join_1_2_base64 = NULL; address StubRoutines::x86::_join_2_3_base64 = NULL; address StubRoutines::x86::_decoding_table_base64 = NULL; -address StubRoutines::x86::_poly1305_mask_addr = NULL; #endif address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = NULL; diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.hpp b/src/hotspot/cpu/x86/stubRoutines_x86.hpp index abf82cfc31191..bb98fcf46cd37 100644 --- a/src/hotspot/cpu/x86/stubRoutines_x86.hpp +++ b/src/hotspot/cpu/x86/stubRoutines_x86.hpp @@ -192,7 +192,6 @@ class x86 { static address _join_1_2_base64; static address
_join_2_3_base64; static address _decoding_table_base64; - static address _poly1305_mask_addr; #endif // byte flip mask for sha256 static address _pshuffle_byte_flip_mask_addr; @@ -324,7 +323,6 @@ class x86 { static address base64_vbmi_join_1_2_addr() { return _join_1_2_base64; } static address base64_vbmi_join_2_3_addr() { return _join_2_3_base64; } static address base64_decoding_table_addr() { return _decoding_table_base64; } - static address poly1305_mask_addr() { return _poly1305_mask_addr;} #endif static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; } static void generate_CRC32C_table(bool is_pclmulqdq_supported); diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index a8281a4528422..e5e51eadac03d 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -7033,7 +7033,7 @@ bool LibraryCallKit::inline_poly1305_processBlocks() { Node* r_start = array_element_address(rlimbs, intcon(0), T_LONG); assert(r_start, "r array is NULL"); - Node* call = make_runtime_call(RC_LEAF, + Node* call = make_runtime_call(RC_LEAF | RC_NO_FP, OptoRuntime::poly1305_processBlocks_Type(), stubAddr, stubName, TypePtr::BOTTOM, input_start, len, acc_start, r_start); From 120247d5b9afdd6af6c8c0b0850c9b8f57eaa4df Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Tue, 8 Nov 2022 16:36:10 -0500 Subject: [PATCH 10/23] make UsePolyIntrinsics option diagnostic --- src/hotspot/share/runtime/globals.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/share/runtime/globals.hpp b/src/hotspot/share/runtime/globals.hpp index 504740e896e83..c5750d6d68e83 100644 --- a/src/hotspot/share/runtime/globals.hpp +++ b/src/hotspot/share/runtime/globals.hpp @@ -238,7 +238,7 @@ const int ObjectAlignmentInBytes = 8; product(bool, UseBASE64Intrinsics, false, \ "Use intrinsics for java.util.Base64") \ \ - product(bool, UsePolyIntrinsics, false, \ + product(bool, UsePolyIntrinsics, false, DIAGNOSTIC, \ "Use intrinsics for sun.security.util.math.intpoly") \ \ product(size_t, LargePageSizeInBytes, 0, \ From da56045242a40c47ef3f1197c6fa5b5603e2bb10 Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Tue, 8 Nov 2022 18:14:07 -0500 Subject: [PATCH 11/23] fix 32-bit build --- .../{stubGenerator_x86_poly.cpp => stubGenerator_x86_64_poly.cpp} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/hotspot/cpu/x86/{stubGenerator_x86_poly.cpp => stubGenerator_x86_64_poly.cpp} (100%) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_poly.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp similarity index 100% rename from src/hotspot/cpu/x86/stubGenerator_x86_poly.cpp rename to src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp From 8b1b40f7a1d4db27bc5dbd4b4db7861baa87c9ee Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Wed, 9 Nov 2022 16:36:57 -0500 Subject: [PATCH 12/23] add getLimbs to interface and reviews --- src/hotspot/cpu/x86/macroAssembler_x86.cpp | 45 +++++++ src/hotspot/cpu/x86/macroAssembler_x86.hpp | 14 ++- src/hotspot/cpu/x86/stubGenerator_x86_64.hpp | 4 +- .../cpu/x86/stubGenerator_x86_64_poly.cpp | 114 +++++++++--------- src/hotspot/cpu/x86/stubRoutines_x86.cpp | 2 +- src/hotspot/cpu/x86/vm_version_x86.cpp | 24 ++-- src/hotspot/share/classfile/vmIntrinsics.hpp | 10 +- src/hotspot/share/opto/library_call.cpp | 43 +------ .../com/sun/crypto/provider/Poly1305.java | 21 +++- .../security/util/math/IntegerModuloP.java | 5 +- 
.../util/math/intpoly/IntegerPolynomial.java | 6 +- 11 files changed, 156 insertions(+), 132 deletions(-) diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index aade92a2aba5f..e2d73b635ea3f 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -1217,6 +1217,17 @@ void MacroAssembler::andptr(Register dst, int32_t imm32) { LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32)); } +void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) { + assert(rscratch != noreg || always_reachable(src), "missing"); + + if (reachable(src)) { + andq(dst, as_Address(src)); + } else { + lea(rscratch, src); + andq(dst, Address(rscratch, 0)); + } +} + void MacroAssembler::atomic_incl(Address counter_addr) { lock(); incrementl(counter_addr); @@ -9105,6 +9116,40 @@ void MacroAssembler::evrord(BasicType type, XMMRegister dst, KRegister mask, XMM fatal("Unexpected type argument %s", type2name(type)); break; } } + +void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { + assert(rscratch != noreg || always_reachable(src), "missing"); + + if (reachable(src)) { + evpandq(dst, nds, as_Address(src), vector_len); + } else { + lea(rscratch, src); + evpandq(dst, nds, Address(rscratch, 0), vector_len); + } +} + +void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { + assert(rscratch != noreg || always_reachable(src), "missing"); + + if (reachable(src)) { + evporq(dst, nds, as_Address(src), vector_len); + } else { + lea(rscratch, src); + evporq(dst, nds, Address(rscratch, 0), vector_len); + } +} + +void MacroAssembler::vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch) { + assert(rscratch != noreg || always_reachable(src3), "missing"); + + if (reachable(src3)) { + vpternlogq(dst, imm8, src2, as_Address(src3), vector_len); + } else { + lea(rscratch, src3); + vpternlogq(dst, imm8, src2, Address(rscratch, 0), vector_len); + } +} + #if COMPILER2_OR_JVMCI void MacroAssembler::fill_masked(BasicType bt, Address dst, XMMRegister xmm, KRegister mask, diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index 5cd0efadac561..9ddf5f35f5599 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -730,6 +730,9 @@ class MacroAssembler: public Assembler { void andptr(Register dst, int32_t src); void andptr(Register src1, Register src2) { LP64_ONLY(andq(src1, src2)) NOT_LP64(andl(src1, src2)) ; } + using Assembler::andq; + void andq(Register dst, AddressLiteral src, Register rscratch = noreg); + void cmp8(AddressLiteral src1, int imm, Register rscratch = noreg); // renamed to drag out the casting of address to int32_t/intptr_t @@ -967,7 +970,6 @@ class MacroAssembler: public Assembler { Register g, Register h, int iteration); void addmq(int disp, Register r1, Register r2); - public: void sha256_AVX2(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0, XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, @@ -977,7 +979,6 @@ class MacroAssembler: public Assembler { XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4, Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block, XMMRegister shuf_mask); - #endif // 
_LP64 void fast_md5(Register buf, Address state, Address ofs, Address limit, @@ -1756,6 +1757,15 @@ class MacroAssembler: public Assembler { void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src, int shift, bool merge, int vlen_enc); void evrord(BasicType type, XMMRegister dst, KRegister mask, XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc); + using Assembler::evpandq; + void evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg); + + using Assembler::evporq; + void evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg); + + using Assembler::vpternlogq; + void vpternlogq(XMMRegister dst, int imm8, XMMRegister src2, AddressLiteral src3, int vector_len, Register rscratch = noreg); + void alltrue(Register dst, uint masklen, KRegister src1, KRegister src2, KRegister kscratch); void anytrue(Register dst, uint masklen, KRegister src, KRegister kscratch); diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp index c4521dda09562..51fa435996544 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp @@ -395,11 +395,11 @@ class StubGenerator: public StubCodeGenerator { void poly1305_multiply_scalar(const Register A0, const Register A1, const Register A2, const Register R0, const Register R1, const Register C1, bool only128); void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, - const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, const Register polyCP); + const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P); void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, bool only128); void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs); void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1, - const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, const Register polyCP); + const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG); // BASE64 stubs diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp index 68e5a00236dc2..35b5aaf9003fc 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp @@ -78,33 +78,36 @@ // B: xmm19-24 // R: xmm25-29 -// Constant Pool Offsets: -enum polyCPOffset { - high_bit = 0, - mask_44 = 64, - mask_42 = 128, -}; -ATTRIBUTE_ALIGNED(64) uint64_t POLY1305_CP[] = { - // OFFSET 0: high_bit +// Constant Pool: +ATTRIBUTE_ALIGNED(64) uint64_t POLY1305_PAD_MSG[] = { 0x0000010000000000, 0x0000010000000000, 0x0000010000000000, 0x0000010000000000, 0x0000010000000000, 0x0000010000000000, 0x0000010000000000, 0x0000010000000000, +}; +static address poly1305_pad_msg() { + return (address)POLY1305_PAD_MSG; +} +ATTRIBUTE_ALIGNED(64) uint64_t POLY1305_MASK42[] = { + 0x000003ffffffffff, 0x000003ffffffffff, + 0x000003ffffffffff, 0x000003ffffffffff, + 0x000003ffffffffff, 0x000003ffffffffff, + 0x000003ffffffffff, 0x000003ffffffffff +}; +static address poly1305_mask42() { + return (address)POLY1305_MASK42; +} + +ATTRIBUTE_ALIGNED(64) uint64_t POLY1305_MASK44[] = { // OFFSET 64: mask_44 - 0xfffffffffff, 0xfffffffffff, - 0xfffffffffff, 
0xfffffffffff, - 0xfffffffffff, 0xfffffffffff, - 0xfffffffffff, 0xfffffffffff, - - // OFFSET 128: mask_42 - 0x3ffffffffff, 0x3ffffffffff, - 0x3ffffffffff, 0x3ffffffffff, - 0x3ffffffffff, 0x3ffffffffff, - 0x3ffffffffff, 0x3ffffffffff + 0x00000fffffffffff, 0x00000fffffffffff, + 0x00000fffffffffff, 0x00000fffffffffff, + 0x00000fffffffffff, 0x00000fffffffffff, + 0x00000fffffffffff, 0x00000fffffffffff, }; -static address poly1305_mask_addr() { - return (address)POLY1305_CP; +static address poly1305_mask44() { + return (address)POLY1305_MASK44; } // Compute product for 8 16-byte message blocks, @@ -146,7 +149,7 @@ static address poly1305_mask_addr() { // void StubGenerator::poly1305_multiply8_avx512( const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, - const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, const Register polyCP) + const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P) { const XMMRegister P0_L = xmm0; const XMMRegister P0_H = xmm1; @@ -155,6 +158,7 @@ void StubGenerator::poly1305_multiply8_avx512( const XMMRegister P2_L = xmm4; const XMMRegister P2_H = xmm5; const XMMRegister TMP1 = xmm6; + const Register polyCP = r13; // Reset partial sums __ evpxorq(P0_L, P0_L, P0_L, Assembler::AVX_512bit); @@ -194,18 +198,18 @@ void StubGenerator::poly1305_multiply8_avx512( // = P2_H A2 A1 A0 | = P2_H×2^130 + A2×2^88 + A1×2^44 + A0×2^0 // __ vpsrlq(TMP1, P0_L, 44, Assembler::AVX_512bit); - __ evpandq(A0, P0_L, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits + __ evpandq(A0, P0_L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, polyCP); // Clear top 20 bits __ vpsllq(P0_H, P0_H, 8, Assembler::AVX_512bit); __ vpaddq(P0_H, P0_H, TMP1, Assembler::AVX_512bit); __ vpaddq(P1_L, P1_L, P0_H, Assembler::AVX_512bit); - __ evpandq(A1, P1_L, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits + __ evpandq(A1, P1_L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, polyCP); // Clear top 20 bits __ vpsrlq(TMP1, P1_L, 44, Assembler::AVX_512bit); __ vpsllq(P1_H, P1_H, 8, Assembler::AVX_512bit); __ vpaddq(P1_H, P1_H, TMP1, Assembler::AVX_512bit); __ vpaddq(P2_L, P2_L, P1_H, Assembler::AVX_512bit); - __ evpandq(A2, P2_L, Address(polyCP, mask_42), Assembler::AVX_512bit); // Clear top 22 bits + __ evpandq(A2, P2_L, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, polyCP); // Clear top 22 bits __ vpsrlq(TMP1, P2_L, 42, Assembler::AVX_512bit); __ vpsllq(P2_H, P2_H, 10, Assembler::AVX_512bit); @@ -217,7 +221,7 @@ void StubGenerator::poly1305_multiply8_avx512( __ vpsllq(P2_H, P2_H, 2, Assembler::AVX_512bit); __ vpaddq(A0, A0, P2_H, Assembler::AVX_512bit); __ vpsrlq(TMP1, A0, 44, Assembler::AVX_512bit); - __ evpandq(A0, A0, Address(polyCP, mask_44), Assembler::AVX_512bit); + __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, polyCP); __ vpaddq(A1, A1, TMP1, Assembler::AVX_512bit); } @@ -345,10 +349,12 @@ void StubGenerator::poly1305_multiply_scalar( // void StubGenerator::poly1305_limbs_avx512( const XMMRegister D0, const XMMRegister D1, - const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, const Register polyCP) + const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG) { const XMMRegister TMP1 = xmm0; const XMMRegister TMP2 = xmm1; + const Register polyCP = r13; + // Interleave blocks of data __ evpunpckhqdq(TMP1, D0, D1, Assembler::AVX_512bit); 
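// ---------------------------------------------------------------------------
// [Editorial sketch; not part of the patch] A scalar model of the qword
// interleave done by the evpunpckhqdq/evpunpcklqdq pair here: within each
// 128-bit lane, the low qwords of both sources are gathered into one vector
// and the high qwords into another, so every 16-byte message block ends up
// with bits 0..63 in `lo` and bits 64..127 in `hi`. Array names are
// hypothetical and model zmm registers as uint64_t[8]:
static inline void poly1305_interleave_sketch(const uint64_t d0[8], const uint64_t d1[8],
                                              uint64_t lo[8], uint64_t hi[8]) {
  for (int lane = 0; lane < 4; lane++) {  // four 128-bit lanes per 512-bit register
    lo[2*lane]     = d0[2*lane];          // low qword of d0's block in this lane
    lo[2*lane + 1] = d1[2*lane];          // low qword of d1's block in this lane
    hi[2*lane]     = d0[2*lane + 1];      // high qword of d0's block
    hi[2*lane + 1] = d1[2*lane + 1];      // high qword of d1's block
  }
}
// ---------------------------------------------------------------------------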
__ evpunpcklqdq(L0, D0, D1, Assembler::AVX_512bit); @@ -356,16 +362,16 @@ void StubGenerator::poly1305_limbs_avx512( // Highest 42-bit limbs of new blocks __ vpsrlq(L2, TMP1, 24, Assembler::AVX_512bit); if (padMSG) { - __ evporq(L2, L2, Address(polyCP, high_bit), Assembler::AVX_512bit); // Add 2^128 to all 8 final qwords of the message + __ evporq(L2, L2, ExternalAddress(poly1305_pad_msg()), Assembler::AVX_512bit, polyCP); // Add 2^128 to all 8 final qwords of the message } // Middle 44-bit limbs of new blocks __ vpsrlq(L1, L0, 44, Assembler::AVX_512bit); __ vpsllq(TMP2, TMP1, 20, Assembler::AVX_512bit); - __ vpternlogq(L1, 0xA8, TMP2, Address(polyCP, mask_44), Assembler::AVX_512bit); // (A OR B AND C) + __ vpternlogq(L1, 0xA8, TMP2, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, polyCP); // (A OR B AND C) // Lowest 44-bit limbs of new blocks - __ evpandq(L0, L0, Address(polyCP, mask_44), Assembler::AVX_512bit); + __ evpandq(L0, L0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, polyCP); } /** @@ -591,20 +597,19 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R const XMMRegister R2P = xmm29; __ subq(rsp, 512/8*6); // Make room to store 6 zmm registers (powers of R) - __ lea(polyCP, ExternalAddress(poly1305_mask_addr())); // Spread accumulator into 44-bit limbs in quadwords C0,C1,C2 __ movq(t0, a0); - __ andq(t0, Address(polyCP, mask_44)); // First limb (Acc[43:0]) + __ andq(t0, ExternalAddress(poly1305_mask44()), polyCP); // First limb (Acc[43:0]) __ movq(C0, t0); __ movq(t0, a1); __ shrdq(a0, t0, 44); - __ andq(a0, Address(polyCP, mask_44)); // Second limb (Acc[77:52]) + __ andq(a0, ExternalAddress(poly1305_mask44()), polyCP); // Second limb (Acc[77:52]) __ movq(C1, a0); __ shrdq(a1, a2, 24); - __ andq(a1, Address(polyCP, mask_42)); // Third limb (Acc[129:88]) + __ andq(a1, ExternalAddress(poly1305_mask42()), polyCP); // Third limb (Acc[129:88]) __ movq(C2, a1); // To add accumulator, we must unroll first loop iteration @@ -615,7 +620,7 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R // A2 to have bits 127-88 of all 8 blocks in 8 qwords __ evmovdquq(T0, Address(input, 0), Assembler::AVX_512bit); __ evmovdquq(T1, Address(input, 64), Assembler::AVX_512bit); - poly1305_limbs_avx512(T0, T1, A0, A1, A2, true, polyCP); + poly1305_limbs_avx512(T0, T1, A0, A1, A2, true); // Add accumulator to the fist message block __ vpaddq(A0, A0, C0, Assembler::AVX_512bit); @@ -628,7 +633,7 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R // A5 to have bits 127-88 of all 8 blocks in 8 qwords __ evmovdquq(T0, Address(input, 64*2), Assembler::AVX_512bit); __ evmovdquq(T1, Address(input, 64*3), Assembler::AVX_512bit); - poly1305_limbs_avx512(T0, T1, A3, A4, A5, true, polyCP); + poly1305_limbs_avx512(T0, T1, A3, A4, A5, true); __ subl(length, 16*16); __ lea(input, Address(input,16*16)); @@ -675,9 +680,8 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R // B0 to have bits 0-43 of all 4 blocks in alternating 8 qwords // B1 to have bits 87-44 of all 4 blocks in alternating 8 qwords // B2 to have bits 127-88 of all 4 blocks in alternating 8 qwords - __ lea(polyCP, ExternalAddress(poly1305_mask_addr())); __ vpxorq(T2, T2, T2, Assembler::AVX_512bit); - poly1305_limbs_avx512(T0, T2, B0, B1, B2, false, polyCP); + poly1305_limbs_avx512(T0, T2, B0, B1, B2, false); // T1 contains the 2 highest bits of the powers of R __ vpsllq(T1, T1, 40, Assembler::AVX_512bit); @@ 
-685,16 +689,16 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R // Broadcast 44-bit limbs of R^4 into R0,R1,R2 __ mov(t0, a0); - __ andq(t0, Address(polyCP, mask_44)); // First limb (R^4[43:0]) + __ andq(t0, ExternalAddress(poly1305_mask44()), polyCP); // First limb (R^4[43:0]) __ evpbroadcastq(R0, t0, Assembler::AVX_512bit); __ movq(t0, a1); __ shrdq(a0, t0, 44); - __ andq(a0, Address(polyCP, mask_44)); // Second limb (R^4[87:44]) + __ andq(a0, ExternalAddress(poly1305_mask44()), polyCP); // Second limb (R^4[87:44]) __ evpbroadcastq(R1, a0, Assembler::AVX_512bit); __ shrdq(a1, a2, 24); - __ andq(a1, Address(polyCP, mask_42)); // Third limb (R^4[129:88]) + __ andq(a1, ExternalAddress(poly1305_mask42()), polyCP); // Third limb (R^4[129:88]) __ evpbroadcastq(R2, a1, Assembler::AVX_512bit); // Generate 4*5*R^4 into {R2P,R1P} @@ -714,8 +718,7 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R // Calculate R^8-R^5 poly1305_multiply8_avx512(B0, B1, B2, // ACC=R^4..R^1 - R0, R1, R2, R1P, R2P, // R^4..R^4, 4*5*R^4 - polyCP); + R0, R1, R2, R1P, R2P); // R^4..R^4, 4*5*R^4 // Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R __ evporq(B0, B0, C0, Assembler::AVX_512bit); @@ -741,9 +744,8 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R __ evmovdquq(Address(rsp, 64*2), B2, Assembler::AVX_512bit); // Calculate R^16-R^9 - poly1305_multiply8_avx512(B0, B1, B2, // ACC=R^8..R^1 - R0, R1, R2, R1P, R2P, // R^8..R^8, 4*5*R^8 - polyCP); + poly1305_multiply8_avx512(B0, B1, B2, // ACC=R^8..R^1 + R0, R1, R2, R1P, R2P); // R^8..R^8, 4*5*R^8 // Store R^16-R^9 for later use __ evmovdquq(Address(rsp, 64*3), B0, Assembler::AVX_512bit); @@ -771,19 +773,17 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R // Load and interleave next block of data (128 bytes) __ evmovdquq(T0, Address(input, 0), Assembler::AVX_512bit); __ evmovdquq(T1, Address(input, 64), Assembler::AVX_512bit); - poly1305_limbs_avx512(T0, T1, B0, B1, B2, true, polyCP); + poly1305_limbs_avx512(T0, T1, B0, B1, B2, true); // Load and interleave next block of data (128 bytes) __ evmovdquq(T0, Address(input, 64*2), Assembler::AVX_512bit); __ evmovdquq(T1, Address(input, 64*3), Assembler::AVX_512bit); - poly1305_limbs_avx512(T0, T1, B3, B4, B5, true, polyCP); + poly1305_limbs_avx512(T0, T1, B3, B4, B5, true); poly1305_multiply8_avx512(A0, A1, A2, // MSG/ACC 16 blocks - R0, R1, R2, R1P, R2P, //R^16..R^16, 4*5*R^16 - polyCP); + R0, R1, R2, R1P, R2P); //R^16..R^16, 4*5*R^16 poly1305_multiply8_avx512(A3, A4, A5, // MSG/ACC 16 blocks - R0, R1, R2, R1P, R2P, //R^16..R^16, 4*5*R^16 - polyCP); + R0, R1, R2, R1P, R2P); //R^16..R^16, 4*5*R^16 __ vpaddq(A0, A0, B0, Assembler::AVX_512bit); // Add low 42-bit bits from new blocks to accumulator __ vpaddq(A1, A1, B1, Assembler::AVX_512bit); // Add medium 42-bit bits from new blocks to accumulator @@ -825,11 +825,9 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); poly1305_multiply8_avx512(A0, A1, A2, // MSG/ACC 16 blocks - B0, B1, B2, B3, B4, // R^16-R^9, R1P, R2P - polyCP); + B0, B1, B2, B3, B4); // R^16-R^9, R1P, R2P poly1305_multiply8_avx512(A3, A4, A5, // MSG/ACC 16 blocks - R0, R1, R2, R1P, R2P, // R^8-R, R1P, R2P - polyCP); + R0, R1, R2, R1P, R2P); // R^8-R, R1P, R2P // Add all blocks (horizontally) // 16->8 blocks @@ -867,13 +865,13 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register 
input, const R // Carry propagation __ vpsrlq(T0, A0, 44, Assembler::AVX_512bit); - __ evpandq(A0, A0, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits + __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, polyCP); // Clear top 20 bits __ vpaddq(A1, A1, T0, Assembler::AVX_512bit); __ vpsrlq(T0, A1, 44, Assembler::AVX_512bit); - __ evpandq(A1, A1, Address(polyCP, mask_44), Assembler::AVX_512bit); // Clear top 20 bits + __ evpandq(A1, A1, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, polyCP); // Clear top 20 bits __ vpaddq(A2, A2, T0, Assembler::AVX_512bit); __ vpsrlq(T0, A2, 42, Assembler::AVX_512bit); - __ evpandq(A2, A2, Address(polyCP, mask_42), Assembler::AVX_512bit); // Clear top 22 bits + __ evpandq(A2, A2, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, polyCP); // Clear top 22 bits __ vpsllq(T1, T0, 2, Assembler::AVX_512bit); __ vpaddq(T0, T0, T1, Assembler::AVX_512bit); __ vpaddq(A0, A0, T0, Assembler::AVX_512bit); diff --git a/src/hotspot/cpu/x86/stubRoutines_x86.cpp b/src/hotspot/cpu/x86/stubRoutines_x86.cpp index ca2f7bc3cc88a..8f285115538e2 100644 --- a/src/hotspot/cpu/x86/stubRoutines_x86.cpp +++ b/src/hotspot/cpu/x86/stubRoutines_x86.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2021, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/src/hotspot/cpu/x86/vm_version_x86.cpp b/src/hotspot/cpu/x86/vm_version_x86.cpp index 9b0a1212a8d6f..09515c30bbdd8 100644 --- a/src/hotspot/cpu/x86/vm_version_x86.cpp +++ b/src/hotspot/cpu/x86/vm_version_x86.cpp @@ -1176,18 +1176,6 @@ void VM_Version::get_processor_features() { FLAG_SET_DEFAULT(UseSHA256Intrinsics, false); } -#ifdef _LP64 - if (supports_avx512ifma() & supports_avx512vlbw()) { - if (FLAG_IS_DEFAULT(UsePolyIntrinsics)) { - FLAG_SET_DEFAULT(UsePolyIntrinsics, true); - } - } else -#endif - if (UsePolyIntrinsics) { - warning("Intrinsics for Poly1305 crypto hash functions not available on this CPU."); - FLAG_SET_DEFAULT(UsePolyIntrinsics, false); - } - #ifdef _LP64 // These are only supported on 64-bit if (UseSHA && supports_avx2() && supports_bmi2()) { @@ -1347,6 +1335,18 @@ void VM_Version::get_processor_features() { } #endif // COMPILER2 && ASSERT +#ifdef _LP64 + if (supports_avx512ifma() & supports_avx512vlbw() & MaxVectorSize >= 64) { + if (FLAG_IS_DEFAULT(UsePolyIntrinsics)) { + FLAG_SET_DEFAULT(UsePolyIntrinsics, true); + } + } else +#endif + if (UsePolyIntrinsics) { + warning("Intrinsics for Poly1305 crypto hash functions not available on this CPU."); + FLAG_SET_DEFAULT(UsePolyIntrinsics, false); + } + #ifdef _LP64 if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) { UseMultiplyToLenIntrinsic = true; diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index a862f01b118cd..9b0cd3f366f67 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -521,17 +521,17 @@ class methodHandle; do_name(decodeBlock_name, "decodeBlock") \ do_signature(decodeBlock_signature, "([BII[BIZZ)I") \ \ - /* support for com.sun.crypto.provider.Poly1305 */ \ - do_class(com_sun_crypto_provider_Poly1305, "com/sun/crypto/provider/Poly1305") \ - do_intrinsic(_poly1305_processBlocks, com_sun_crypto_provider_Poly1305, processMultipleBlocks_name, 
putCharStringU_signature, F_R) \ - do_name(processMultipleBlocks_name, "processMultipleBlocks") \ - \ /* support for com.sun.crypto.provider.GHASH */ \ do_class(com_sun_crypto_provider_ghash, "com/sun/crypto/provider/GHASH") \ do_intrinsic(_ghash_processBlocks, com_sun_crypto_provider_ghash, processBlocks_name, ghash_processBlocks_signature, F_S) \ do_name(processBlocks_name, "processBlocks") \ do_signature(ghash_processBlocks_signature, "([BII[J[J)V") \ \ + /* support for com.sun.crypto.provider.Poly1305 */ \ + do_class(com_sun_crypto_provider_Poly1305, "com/sun/crypto/provider/Poly1305") \ + do_intrinsic(_poly1305_processBlocks, com_sun_crypto_provider_Poly1305, processMultipleBlocks_name, ghash_processBlocks_signature, F_R) \ + do_name(processMultipleBlocks_name, "processMultipleBlocks") \ + \ /* support for java.util.zip */ \ do_class(java_util_zip_CRC32, "java/util/zip/CRC32") \ do_intrinsic(_updateCRC32, java_util_zip_CRC32, update_name, int2_int_signature, F_SN) \ diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index e5e51eadac03d..f3ef66d0774e2 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -6973,7 +6973,7 @@ bool LibraryCallKit::inline_poly1305_processBlocks() { address stubAddr; const char *stubName; assert(UsePolyIntrinsics, "need Poly intrinsics support"); - assert(callee()->signature()->size() == 3, "poly1305_processBlocks has %d parameters", callee()->signature()->size()); + assert(callee()->signature()->size() == 5, "poly1305_processBlocks has %d parameters", callee()->signature()->size()); stubAddr = StubRoutines::poly1305_processBlocks(); stubName = "poly1305_processBlocks"; @@ -6982,50 +6982,13 @@ bool LibraryCallKit::inline_poly1305_processBlocks() { Node* input = argument(1); Node* input_offset = argument(2); Node* len = argument(3); - - Node* accFace = load_field_from_object(polyObj, "a", "Lsun/security/util/math/MutableIntegerModuloP;"); - assert(accFace != NULL, "Accumulator field is null"); - const TypeInstPtr* ainst = _gvn.type(accFace)->isa_instptr(); - assert(ainst != NULL, "Accumulator obj is null"); - assert(ainst->is_loaded(), "MutableIntegerModuloP obj is not loaded"); - ciKlass* klass_MutableElement = ainst->instance_klass()->find_klass(ciSymbol::make("sun/security/util/math/intpoly/IntegerPolynomial$MutableElement")); - assert(klass_MutableElement != NULL, "IntegerPolynomial$MutableElement class is null"); - assert(klass_MutableElement->is_loaded(), "IntegerPolynomial$MutableElement class is not loaded"); - ciInstanceKlass* instklass_MutableElement = klass_MutableElement->as_instance_klass(); - - const TypeKlassPtr* aklass = TypeKlassPtr::make(instklass_MutableElement); - const TypeOopPtr* atype = aklass->as_instance_type()->cast_to_ptr_type(TypePtr::NotNull); - Node* accObj = new CheckCastPPNode(control(), accFace, atype); - accObj = _gvn.transform(accObj); - Node* alimbs = load_field_from_object(accObj, "limbs", "[J"); - - Node* rFace = load_field_from_object(polyObj, "r", "Lsun/security/util/math/IntegerModuloP;"); //this.r.limbs - assert(rFace != NULL, "R field is null"); - const TypeInstPtr* rinst = _gvn.type(rFace)->isa_instptr(); - assert(rinst != NULL, "R obj is null"); - assert(rinst->is_loaded(), "IntegerModuloP obj is not loaded"); - ciKlass* klass_ImmutableElement = rinst->instance_klass()->find_klass(ciSymbol::make("sun/security/util/math/intpoly/IntegerPolynomial$ImmutableElement")); - assert(klass_ImmutableElement != NULL, 
"IntegerPolynomial$ImmutableElement class is null"); - assert(klass_ImmutableElement->is_loaded(), "IntegerPolynomial$ImmutableElement class is not loaded"); - ciInstanceKlass* instklass_ImmutableElement = klass_ImmutableElement->as_instance_klass(); - - const TypeKlassPtr* rklass = TypeKlassPtr::make(instklass_ImmutableElement); - const TypeOopPtr* rtype = rklass->as_instance_type()->cast_to_ptr_type(TypePtr::NotNull); - Node* rObj = new CheckCastPPNode(control(), rFace, rtype); - rObj = _gvn.transform(rObj); - Node* rlimbs = load_field_from_object(rObj, "limbs", "[J"); + Node* alimbs = argument(4); + Node* rlimbs = argument(5); input = must_be_not_null(input, true); alimbs = must_be_not_null(alimbs, true); rlimbs = must_be_not_null(rlimbs, true); - // Intrinsic assumes there are exactly 5 limbs! Currently enforced by IntegerModuloP.checkLimbsForIntrinsic - // FIXME: where to branch to if limbs array length != 5? Could be an 'assert'/RuntimeException - // FIXME: repeat for rlimbs - // Node* cmp = _gvn.transform(new CmpINode(load_array_length(alimbs), intcon(5))); - // Node* bol = _gvn.transform(new BoolNode(cmp, BoolTest::eq)); - // Node* if_eq = generate_slow_guard(bol, slow_region); - Node* input_start = array_element_address(input, input_offset, T_BYTE); assert(input_start, "input array is NULL"); Node* acc_start = array_element_address(alimbs, intcon(0), T_LONG); diff --git a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java index c335588ff8633..1c8e5a4f75117 100644 --- a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java +++ b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java @@ -172,10 +172,10 @@ void engineUpdate(byte[] input, int offset, int len) { } int blockMultipleLength = len & (~(BLOCK_LENGTH-1)); - Objects.checkFromIndexSize(offset, blockMultipleLength, input.length); - a.checkLimbsForIntrinsic(); - r.checkLimbsForIntrinsic(); - processMultipleBlocks(input, offset, blockMultipleLength); + long[] aLimbs = a.getLimbs(); + long[] rLimbs = r.getLimbs(); + processMultipleBlocksCheck(input, offset, blockMultipleLength, aLimbs, rLimbs); + processMultipleBlocks(input, offset, blockMultipleLength, aLimbs, rLimbs); offset += blockMultipleLength; len -= blockMultipleLength; @@ -246,7 +246,7 @@ private void processBlock(byte[] block, int offset, int length) { @ForceInline @IntrinsicCandidate - private void processMultipleBlocks(byte[] input, int offset, int length) { + private void processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) { while (length >= BLOCK_LENGTH) { n.setValue(input, offset, BLOCK_LENGTH, (byte)0x01); a.setSum(n); // A += (temp | 0x01) @@ -256,6 +256,17 @@ private void processMultipleBlocks(byte[] input, int offset, int length) { } } + private static void processMultipleBlocksCheck(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) { + Objects.checkFromIndexSize(offset, length, input.length); + final int numLimbs = 5; // Intrinsic expects exactly 5 limbs + if (aLimbs.length != numLimbs) { + throw new RuntimeException("invalid accumulator length: " + aLimbs.length); + } + if (rLimbs.length != numLimbs) { + throw new RuntimeException("invalid R length: " + rLimbs.length); + } + } + /** * Partition the authentication key into the R and S components, clamp * the R value, and instantiate IntegerModuloP objects to R and S's diff --git a/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java 
b/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java index 996c6d795eadd..6131f2c4b4632 100644 --- a/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java +++ b/src/java.base/share/classes/sun/security/util/math/IntegerModuloP.java @@ -209,9 +209,8 @@ default ImmutableIntegerModuloP pow(BigInteger b) { } /** - * Enforce java to IntrinsicCandidate 'contract' - * @throws IndexOutOfBoundsException if the check fails + * Break encapsulation, used for IntrinsicCandidate functions */ - void checkLimbsForIntrinsic(); + long[] getLimbs(); } diff --git a/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java b/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java index 43c67dfc136b7..693d88bcc76af 100644 --- a/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java +++ b/src/java.base/share/classes/sun/security/util/math/intpoly/IntegerPolynomial.java @@ -32,8 +32,6 @@ import java.nio.ByteOrder; import java.util.Arrays; -import jdk.internal.util.Preconditions; - /** * A large number polynomial representation using sparse limbs of signed * long (64-bit) values. Limb values will always fit within a long, so inputs @@ -629,8 +627,8 @@ public void asByteArray(byte[] result) { limbsToByteArray(limbs, result); } - public void checkLimbsForIntrinsic() { - Preconditions.checkFromIndexSize(0, numLimbs, limbs.length, null); + public long[] getLimbs() { + return limbs; } } From abfc68f49fad81d2d34ef2e4447d4f198585d908 Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Wed, 9 Nov 2022 20:17:35 -0500 Subject: [PATCH 13/23] fix windows and 32b linux builds --- src/hotspot/cpu/x86/macroAssembler_x86.cpp | 2 ++ src/hotspot/cpu/x86/macroAssembler_x86.hpp | 2 ++ src/hotspot/cpu/x86/vm_version_x86.cpp | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.cpp b/src/hotspot/cpu/x86/macroAssembler_x86.cpp index e2d73b635ea3f..3026109d979dc 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp @@ -1217,6 +1217,7 @@ void MacroAssembler::andptr(Register dst, int32_t imm32) { LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32)); } +#ifdef _LP64 void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) { assert(rscratch != noreg || always_reachable(src), "missing"); @@ -1227,6 +1228,7 @@ void MacroAssembler::andq(Register dst, AddressLiteral src, Register rscratch) { andq(dst, Address(rscratch, 0)); } } +#endif void MacroAssembler::atomic_incl(Address counter_addr) { lock(); diff --git a/src/hotspot/cpu/x86/macroAssembler_x86.hpp b/src/hotspot/cpu/x86/macroAssembler_x86.hpp index 9ddf5f35f5599..5a0a3d8c9a1ee 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86.hpp @@ -730,8 +730,10 @@ class MacroAssembler: public Assembler { void andptr(Register dst, int32_t src); void andptr(Register src1, Register src2) { LP64_ONLY(andq(src1, src2)) NOT_LP64(andl(src1, src2)) ; } +#ifdef _LP64 using Assembler::andq; void andq(Register dst, AddressLiteral src, Register rscratch = noreg); +#endif void cmp8(AddressLiteral src1, int imm, Register rscratch = noreg); diff --git a/src/hotspot/cpu/x86/vm_version_x86.cpp b/src/hotspot/cpu/x86/vm_version_x86.cpp index 09515c30bbdd8..d81a488387284 100644 --- a/src/hotspot/cpu/x86/vm_version_x86.cpp +++ b/src/hotspot/cpu/x86/vm_version_x86.cpp @@ -1336,7 +1336,7 @@ void VM_Version::get_processor_features() { 
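The hunk below replaces the bitwise & in this feature check with the logical &&. Both forms compute the same value here, because the relational >= binds tighter than &, but && short-circuits and states the boolean intent directly. A minimal, self-contained Java sketch of the difference (class and method names are illustrative only, not JDK code):

public class ShortCircuitDemo {
    static boolean probeRan = false;

    static boolean expensiveProbe() {
        probeRan = true;          // side effect lets us observe whether evaluation happened
        return true;
    }

    public static void main(String[] args) {
        boolean supported = false;

        boolean eager = supported & expensiveProbe();   // '&' evaluates both operands
        System.out.println(probeRan);                   // true: the probe ran anyway

        probeRan = false;
        boolean lazy = supported && expensiveProbe();   // '&&' stops at the first false
        System.out.println(probeRan);                   // false: the probe was skipped
        System.out.println(eager + " " + lazy);         // false false
    }
}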
#endif // COMPILER2 && ASSERT #ifdef _LP64 - if (supports_avx512ifma() & supports_avx512vlbw() & MaxVectorSize >= 64) { + if (supports_avx512ifma() && supports_avx512vlbw() && MaxVectorSize >= 64) { if (FLAG_IS_DEFAULT(UsePolyIntrinsics)) { FLAG_SET_DEFAULT(UsePolyIntrinsics, true); } From 2176caf8809ed32fbe8facb90d98738abcb928b4 Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Thu, 10 Nov 2022 17:39:47 -0500 Subject: [PATCH 14/23] Sandhya's review --- src/hotspot/share/opto/library_call.cpp | 1 - .../share/classes/com/sun/crypto/provider/Poly1305.java | 7 ++++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index f3ef66d0774e2..0f6a7fd3beaca 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -6978,7 +6978,6 @@ bool LibraryCallKit::inline_poly1305_processBlocks() { stubName = "poly1305_processBlocks"; if (!stubAddr) return false; - Node* polyObj = argument(0); Node* input = argument(1); Node* input_offset = argument(2); Node* len = argument(3); diff --git a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java index 1c8e5a4f75117..01c7efd3861bd 100644 --- a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java +++ b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java @@ -25,7 +25,6 @@ package com.sun.crypto.provider; -import java.lang.reflect.Field; import java.nio.ByteBuffer; import java.security.Key; import java.security.InvalidKeyException; @@ -244,13 +243,15 @@ private void processBlock(byte[] block, int offset, int length) { a.setProduct(r); // a = (a * r) % p } + // This is an intrinsified method. The unused parameters aLimbs and rLimbs are used by the intrinsic. + // They correspond to this.a and this.r respectively @ForceInline @IntrinsicCandidate private void processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) { while (length >= BLOCK_LENGTH) { n.setValue(input, offset, BLOCK_LENGTH, (byte)0x01); - a.setSum(n); // A += (temp | 0x01) - a.setProduct(r); // A = (A * R) % p + a.setSum(n); // a += (n | 0x01) + a.setProduct(r); // a = (a * r) % p offset += BLOCK_LENGTH; length -= BLOCK_LENGTH; } From 196ee35b048fac21b018225ecaec4af4212b2ba6 Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Thu, 10 Nov 2022 17:56:18 -0500 Subject: [PATCH 15/23] jcheck --- .../share/classes/com/sun/crypto/provider/Poly1305.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java index 01c7efd3861bd..582c2871c51f3 100644 --- a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java +++ b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java @@ -244,7 +244,7 @@ private void processBlock(byte[] block, int offset, int length) { } // This is an intrinsified method. The unused parameters aLimbs and rLimbs are used by the intrinsic. 
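For reference, the arithmetic that processMultipleBlocks and its intrinsic must both realize is the standard per-block Poly1305 update a = ((a + n + 2^128) * r) mod (2^130 - 5), where n is the 16-byte block read little-endian and the 2^128 term is the 0x01 pad byte. A self-contained BigInteger sketch of that loop (illustrative only, not the JDK implementation):

import java.math.BigInteger;

public class Poly1305BlockReference {
    private static final BigInteger P =
            BigInteger.TWO.pow(130).subtract(BigInteger.valueOf(5));

    // a = ((a + block + 2^128) * r) mod p, applied to every full 16-byte block
    static BigInteger processBlocks(BigInteger a, BigInteger r,
                                    byte[] input, int offset, int length) {
        while (length >= 16) {
            BigInteger n = BigInteger.ZERO;
            for (int i = 15; i >= 0; i--) {      // 128-bit little-endian load
                n = n.shiftLeft(8).or(BigInteger.valueOf(input[offset + i] & 0xff));
            }
            a = a.add(n.setBit(128)).multiply(r).mod(P);  // setBit(128) is the pad byte
            offset += 16;
            length -= 16;
        }
        return a;
    }
}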
- // They correspond to this.a and this.r respectively + // They correspond to this.a and this.r respectively @ForceInline @IntrinsicCandidate private void processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) { From 835fbe3a29b198699e26292d0ccadf306264893b Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Thu, 10 Nov 2022 20:08:41 -0500 Subject: [PATCH 16/23] live review with Sandhya --- .../cpu/x86/stubGenerator_x86_64_poly.cpp | 33 ++++++++++++------- .../unittest/Poly1305UnitTestDriver.java | 8 ++--- .../provider/Poly1305IntrinsicFuzzTest.java | 1 + 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp index 35b5aaf9003fc..997c7b69b10b4 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp @@ -596,7 +596,7 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R const XMMRegister R1P = xmm28; const XMMRegister R2P = xmm29; - __ subq(rsp, 512/8*6); // Make room to store 6 zmm registers (powers of R) + __ subq(rsp, (512/8)*6); // Make room to store 6 zmm registers (powers of R) // Spread accumulator into 44-bit limbs in quadwords C0,C1,C2 __ movq(t0, a0); @@ -781,13 +781,13 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R poly1305_limbs_avx512(T0, T1, B3, B4, B5, true); poly1305_multiply8_avx512(A0, A1, A2, // MSG/ACC 16 blocks - R0, R1, R2, R1P, R2P); //R^16..R^16, 4*5*R^16 + R0, R1, R2, R1P, R2P); // R^16..R^16, 4*5*R^16 poly1305_multiply8_avx512(A3, A4, A5, // MSG/ACC 16 blocks - R0, R1, R2, R1P, R2P); //R^16..R^16, 4*5*R^16 + R0, R1, R2, R1P, R2P); // R^16..R^16, 4*5*R^16 __ vpaddq(A0, A0, B0, Assembler::AVX_512bit); // Add low 42-bit bits from new blocks to accumulator __ vpaddq(A1, A1, B1, Assembler::AVX_512bit); // Add medium 42-bit bits from new blocks to accumulator - __ vpaddq(A2, A2, B2, Assembler::AVX_512bit); //Add highest bits from new blocks to accumulator + __ vpaddq(A2, A2, B2, Assembler::AVX_512bit); // Add highest bits from new blocks to accumulator __ vpaddq(A3, A3, B3, Assembler::AVX_512bit); // Add low 42-bit bits from new blocks to accumulator __ vpaddq(A4, A4, B4, Assembler::AVX_512bit); // Add medium 42-bit bits from new blocks to accumulator __ vpaddq(A5, A5, B5, Assembler::AVX_512bit); // Add highest bits from new blocks to accumulator @@ -825,9 +825,9 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); poly1305_multiply8_avx512(A0, A1, A2, // MSG/ACC 16 blocks - B0, B1, B2, B3, B4); // R^16-R^9, R1P, R2P - poly1305_multiply8_avx512(A3, A4, A5, // MSG/ACC 16 blocks - R0, R1, R2, R1P, R2P); // R^8-R, R1P, R2P + B0, B1, B2, B3, B4); // R^16-R^9, R1P, R2P + poly1305_multiply8_avx512(A3, A4, A5, // MSG/ACC 16 blocks + R0, R1, R2, R1P, R2P); // R^8-R, R1P, R2P // Add all blocks (horizontally) // 16->8 blocks @@ -882,15 +882,17 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R __ movq(t0, A1); __ movq(t1, t0); __ shlq(t1, 44); - __ orq(a0, t1); - __ shrq(t0, 20); + __ movq(a2, A2); __ movq(a1, a2); __ shlq(a1, 24); - __ orq(a1, t0); __ shrq(a2, 40); + __ addq(a0, t1); + __ adcq(a1, t0); + __ adcq(a2, 0); + // Cleanup __ vpxorq(xmm0, xmm0, xmm0, Assembler::AVX_512bit); __ vpxorq(xmm1, xmm1, xmm1, Assembler::AVX_512bit); @@ -953,11 +955,20 @@ address 
StubGenerator::generate_poly1305_processBlocks() {
const Register R = r8;
#ifdef _WIN64
+ // c_rarg0 - rcx
+ // c_rarg1 - rdx
+ // c_rarg2 - r8
+ // c_rarg3 - r9
__ mov(input, c_rarg0);
__ mov(length, c_rarg1);
__ mov(accumulator, c_rarg2);
__ mov(R, c_rarg3);
- #else // input already in correct position for linux; dont clobber R, args copied out-of-order
+ #else
+ // c_rarg0 - rdi
+ // c_rarg1 - rsi
+ // c_rarg2 - rdx
+ // c_rarg3 - rcx
+ // don't clobber R, args copied out-of-order
__ mov(length, c_rarg1);
__ mov(R, c_rarg3);
__ mov(accumulator, c_rarg2);
diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java
index b8bc22c244af5..34bb118155f32 100644
--- a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java
+++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/Poly1305UnitTestDriver.java
@@ -31,6 +31,7 @@
/*
* @test
+ * @key randomness
* @modules java.base/com.sun.crypto.provider
* @run main java.base/com.sun.crypto.provider.Poly1305IntrinsicFuzzTest
* @summary Unit test for com.sun.crypto.provider.Poly1305.
@@ -45,18 +46,17 @@
/*
* @test
+ * @key randomness
* @modules java.base/com.sun.crypto.provider
- * @run main java.base/com.sun.crypto.provider.Poly1305IntrinsicFuzzTest
* @summary Unit test for IntrinsicCandidate in com.sun.crypto.provider.Poly1305.
- * @run main/othervm -Xcomp -XX:-TieredCompilation com.sun.crypto.provider.Cipher.ChaCha20.Poly1305UnitTestDriver
+ * @run main/othervm -Xcomp -XX:-TieredCompilation java.base/com.sun.crypto.provider.Poly1305IntrinsicFuzzTest
*/
/*
* @test
* @modules java.base/com.sun.crypto.provider
- * @run main java.base/com.sun.crypto.provider.Poly1305KAT
* @summary Unit test for IntrinsicCandidate in com.sun.crypto.provider.Poly1305. 
- * @run main/othervm -Xcomp -XX:-TieredCompilation com.sun.crypto.provider.Cipher.ChaCha20.Poly1305UnitTestDriver + * @run main/othervm -Xcomp -XX:-TieredCompilation java.base/com.sun.crypto.provider.Poly1305KAT */ package com.sun.crypto.provider.Cipher.ChaCha20; diff --git a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java index aae840b011af6..3e7ecbad62e0a 100644 --- a/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java +++ b/test/jdk/com/sun/crypto/provider/Cipher/ChaCha20/unittest/java.base/com/sun/crypto/provider/Poly1305IntrinsicFuzzTest.java @@ -40,6 +40,7 @@ public static void main(String[] args) throws Exception { for (int i = 0; i < repeat; i++) { run(); } + System.out.println("Fuzz Success"); } public static void run() throws Exception { From 2a225e42aee562ce697190bbcb48e2a48fa0e82f Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Fri, 11 Nov 2022 12:52:15 -0500 Subject: [PATCH 17/23] Vladimir's review --- .../cpu/x86/stubGenerator_x86_64_poly.cpp | 40 +++++++++---------- .../com/sun/crypto/provider/Poly1305.java | 4 +- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp index 997c7b69b10b4..afe45ac820adc 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp @@ -68,7 +68,7 @@ // t2 = r14 // t3 = r15 // t0 = r14 -// polyCP = r13 +// rscratch = r13 // stack(rsp, rbp) // imul(rax, rdx) // ZMMs: @@ -158,7 +158,7 @@ void StubGenerator::poly1305_multiply8_avx512( const XMMRegister P2_L = xmm4; const XMMRegister P2_H = xmm5; const XMMRegister TMP1 = xmm6; - const Register polyCP = r13; + const Register rscratch = r13; // Reset partial sums __ evpxorq(P0_L, P0_L, P0_L, Assembler::AVX_512bit); @@ -198,18 +198,18 @@ void StubGenerator::poly1305_multiply8_avx512( // = P2_H A2 A1 A0 | = P2_H×2^130 + A2×2^88 + A1×2^44 + A0×2^0 // __ vpsrlq(TMP1, P0_L, 44, Assembler::AVX_512bit); - __ evpandq(A0, P0_L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, polyCP); // Clear top 20 bits + __ evpandq(A0, P0_L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits __ vpsllq(P0_H, P0_H, 8, Assembler::AVX_512bit); __ vpaddq(P0_H, P0_H, TMP1, Assembler::AVX_512bit); __ vpaddq(P1_L, P1_L, P0_H, Assembler::AVX_512bit); - __ evpandq(A1, P1_L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, polyCP); // Clear top 20 bits + __ evpandq(A1, P1_L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits __ vpsrlq(TMP1, P1_L, 44, Assembler::AVX_512bit); __ vpsllq(P1_H, P1_H, 8, Assembler::AVX_512bit); __ vpaddq(P1_H, P1_H, TMP1, Assembler::AVX_512bit); __ vpaddq(P2_L, P2_L, P1_H, Assembler::AVX_512bit); - __ evpandq(A2, P2_L, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, polyCP); // Clear top 22 bits + __ evpandq(A2, P2_L, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, rscratch); // Clear top 22 bits __ vpsrlq(TMP1, P2_L, 42, Assembler::AVX_512bit); __ vpsllq(P2_H, P2_H, 10, Assembler::AVX_512bit); @@ -221,7 +221,7 @@ void StubGenerator::poly1305_multiply8_avx512( __ vpsllq(P2_H, P2_H, 2, Assembler::AVX_512bit); __ vpaddq(A0, A0, P2_H, 
Assembler::AVX_512bit);
__ vpsrlq(TMP1, A0, 44, Assembler::AVX_512bit);
- __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, polyCP);
+ __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch);
__ vpaddq(A1, A1, TMP1, Assembler::AVX_512bit);
}
@@ -353,7 +353,7 @@ void StubGenerator::poly1305_limbs_avx512(
{
const XMMRegister TMP1 = xmm0;
const XMMRegister TMP2 = xmm1;
- const Register polyCP = r13;
+ const Register rscratch = r13;
// Interleave blocks of data
__ evpunpckhqdq(TMP1, D0, D1, Assembler::AVX_512bit);
@@ -362,16 +362,16 @@ void StubGenerator::poly1305_limbs_avx512(
// Highest 42-bit limbs of new blocks
__ vpsrlq(L2, TMP1, 24, Assembler::AVX_512bit);
if (padMSG) {
- __ evporq(L2, L2, ExternalAddress(poly1305_pad_msg()), Assembler::AVX_512bit, polyCP); // Add 2^128 to all 8 final qwords of the message
+ __ evporq(L2, L2, ExternalAddress(poly1305_pad_msg()), Assembler::AVX_512bit, rscratch); // Add 2^128 to all 8 final qwords of the message
}
// Middle 44-bit limbs of new blocks
__ vpsrlq(L1, L0, 44, Assembler::AVX_512bit);
__ vpsllq(TMP2, TMP1, 20, Assembler::AVX_512bit);
- __ vpternlogq(L1, 0xA8, TMP2, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, polyCP); // (A OR B AND C)
+ __ vpternlogq(L1, 0xA8, TMP2, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // (A OR B AND C)
// Lowest 44-bit limbs of new blocks
- __ evpandq(L0, L0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, polyCP);
+ __ evpandq(L0, L0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch);
}
/**
@@ -564,7 +564,7 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R
// poly1305_multiply_scalar clobbers: r13-r15, rax, rdx
const Register t0 = r14;
const Register t1 = r13;
- const Register polyCP = r13;
+ const Register rscratch = r13;
// poly1305_limbs_avx512 clobbers: xmm0, xmm1
// poly1305_multiply8_avx512 clobbers: xmm0-xmm6
@@ -600,16 +600,16 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R
// Spread accumulator into 44-bit limbs in quadwords C0,C1,C2
__ movq(t0, a0);
- __ andq(t0, ExternalAddress(poly1305_mask44()), polyCP); // First limb (Acc[43:0])
+ __ andq(t0, ExternalAddress(poly1305_mask44()), rscratch); // First limb (Acc[43:0])
__ movq(C0, t0);
__ movq(t0, a1);
__ shrdq(a0, t0, 44);
- __ andq(a0, ExternalAddress(poly1305_mask44()), polyCP); // Second limb (Acc[87:44])
+ __ andq(a0, ExternalAddress(poly1305_mask44()), rscratch); // Second limb (Acc[87:44])
__ movq(C1, a0);
__ shrdq(a1, a2, 24);
- __ andq(a1, ExternalAddress(poly1305_mask42()), polyCP); // Third limb (Acc[129:88])
+ __ andq(a1, ExternalAddress(poly1305_mask42()), rscratch); // Third limb (Acc[129:88])
__ movq(C2, a1);
// To add accumulator, we must unroll first loop iteration
@@ -689,16 +689,16 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R
// Broadcast 44-bit limbs of R^4 into R0,R1,R2
__ mov(t0, a0);
- __ andq(t0, ExternalAddress(poly1305_mask44()), polyCP); // First limb (R^4[43:0])
+ __ andq(t0, ExternalAddress(poly1305_mask44()), rscratch); // First limb (R^4[43:0])
__ evpbroadcastq(R0, t0, Assembler::AVX_512bit);
__ movq(t0, a1);
__ shrdq(a0, t0, 44);
- __ andq(a0, ExternalAddress(poly1305_mask44()), polyCP); // Second limb (R^4[87:44])
+ __ andq(a0, ExternalAddress(poly1305_mask44()), rscratch); // Second limb (R^4[87:44])
__ evpbroadcastq(R1, a0, Assembler::AVX_512bit);
__ shrdq(a1, a2, 24);
- __ andq(a1, 
ExternalAddress(poly1305_mask42()), polyCP); // Third limb (R^4[129:88]) + __ andq(a1, ExternalAddress(poly1305_mask42()), rscratch); // Third limb (R^4[129:88]) __ evpbroadcastq(R2, a1, Assembler::AVX_512bit); // Generate 4*5*R^4 into {R2P,R1P} @@ -865,13 +865,13 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R // Carry propagation __ vpsrlq(T0, A0, 44, Assembler::AVX_512bit); - __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, polyCP); // Clear top 20 bits + __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits __ vpaddq(A1, A1, T0, Assembler::AVX_512bit); __ vpsrlq(T0, A1, 44, Assembler::AVX_512bit); - __ evpandq(A1, A1, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, polyCP); // Clear top 20 bits + __ evpandq(A1, A1, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits __ vpaddq(A2, A2, T0, Assembler::AVX_512bit); __ vpsrlq(T0, A2, 42, Assembler::AVX_512bit); - __ evpandq(A2, A2, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, polyCP); // Clear top 22 bits + __ evpandq(A2, A2, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, rscratch); // Clear top 22 bits __ vpsllq(T1, T0, 2, Assembler::AVX_512bit); __ vpaddq(T0, T0, T1, Assembler::AVX_512bit); __ vpaddq(A0, A0, T0, Assembler::AVX_512bit); diff --git a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java index 582c2871c51f3..d24b29cedbfdf 100644 --- a/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java +++ b/src/java.base/share/classes/com/sun/crypto/provider/Poly1305.java @@ -249,9 +249,7 @@ private void processBlock(byte[] block, int offset, int length) { @IntrinsicCandidate private void processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs) { while (length >= BLOCK_LENGTH) { - n.setValue(input, offset, BLOCK_LENGTH, (byte)0x01); - a.setSum(n); // a += (n | 0x01) - a.setProduct(r); // a = (a * r) % p + processBlock(input, offset, BLOCK_LENGTH); offset += BLOCK_LENGTH; length -= BLOCK_LENGTH; } From 8f5942d9ec4824a34a18bd18c37f51f2432c6d0c Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Tue, 15 Nov 2022 14:35:56 -0500 Subject: [PATCH 18/23] Vladimir's review comments --- src/hotspot/cpu/x86/stubGenerator_x86_64.hpp | 7 ++- .../cpu/x86/stubGenerator_x86_64_poly.cpp | 61 ++++++++++--------- src/hotspot/share/opto/library_call.cpp | 3 + 3 files changed, 39 insertions(+), 32 deletions(-) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp index 51fa435996544..b315d2f91ef76 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp @@ -396,10 +396,11 @@ class StubGenerator: public StubCodeGenerator { const Register R0, const Register R1, const Register C1, bool only128); void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P); - void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, bool only128); - void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs); + void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, bool only128, const 
Register t1, const Register t2); + void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs, const Register t1, const Register t2); void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1, - const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG); + const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, + const XMMRegister TMP1, const XMMRegister TMP2, const Register rscratch); // BASE64 stubs diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp index afe45ac820adc..9a81d93a3db03 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp @@ -100,7 +100,6 @@ static address poly1305_mask42() { } ATTRIBUTE_ALIGNED(64) uint64_t POLY1305_MASK44[] = { - // OFFSET 64: mask_44 0x00000fffffffffff, 0x00000fffffffffff, 0x00000fffffffffff, 0x00000fffffffffff, 0x00000fffffffffff, 0x00000fffffffffff, @@ -349,12 +348,9 @@ void StubGenerator::poly1305_multiply_scalar( // void StubGenerator::poly1305_limbs_avx512( const XMMRegister D0, const XMMRegister D1, - const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG) + const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, + const XMMRegister TMP1, const XMMRegister TMP2, const Register rscratch) { - const XMMRegister TMP1 = xmm0; - const XMMRegister TMP2 = xmm1; - const Register rscratch = r13; - // Interleave blocks of data __ evpunpckhqdq(TMP1, D0, D1, Assembler::AVX_512bit); __ evpunpcklqdq(L0, D0, D1, Assembler::AVX_512bit); @@ -379,11 +375,11 @@ void StubGenerator::poly1305_limbs_avx512( * * a2 is optional. When only128 is set, limbs are expected to fit into 128-bits (i.e. 
a1:a0 such as clamped R) */
-void StubGenerator::poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, bool only128)
+void StubGenerator::poly1305_limbs(
+ const Register limbs, const Register a0, const Register a1,
+ const Register a2, bool only128,
+ const Register t1, const Register t2)
{
- const Register t1 = r13;
- const Register t2 = r14;
-
__ movq(a0, Address(limbs, 0));
__ movq(t1, Address(limbs, 8));
__ shlq(t1, 26);
@@ -425,11 +421,11 @@ void StubGenerator::poly1305_limbs(const Register limbs, const Register a0, cons
/**
* Break 3×64-bit a2:a1:a0 limbs into 5×26-bit limbs and store out into 5 quadwords at address `limbs`
*/
-void StubGenerator::poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs)
+void StubGenerator::poly1305_limbs_out(
+ const Register a0, const Register a1, const Register a2,
+ const Register limbs,
+ const Register t1, const Register t2)
{
- const Register t1 = r13;
- const Register t2 = r14;
-
// Extra round of reduction
// Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0
__ movq(t1, a2);
@@ -553,9 +549,10 @@ void StubGenerator::poly1305_limbs_out(const Register a0, const Register a1, con
// T = A >> 1 // 2 ->1 blocks
// A = A + T // a = A
-void StubGenerator::poly1305_process_blocks_avx512(const Register input, const Register length,
- const Register a0, const Register a1, const Register a2,
- const Register r0, const Register r1, const Register c1)
+void StubGenerator::poly1305_process_blocks_avx512(
+ const Register input, const Register length,
+ const Register a0, const Register a1, const Register a2,
+ const Register r0, const Register r1, const Register c1)
{
Label L_process256Loop, L_process256LoopDone;
// Register Map:
@@ -566,8 +563,10 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R
const Register t1 = r13;
const Register rscratch = r13;
- // poly1305_limbs_avx512 clobbers: xmm0, xmm1
// poly1305_multiply8_avx512 clobbers: xmm0-xmm6
+ const XMMRegister TMP1 = xmm0;
+ const XMMRegister TMP2 = xmm1;
+
const XMMRegister T0 = xmm2;
const XMMRegister T1 = xmm3;
const XMMRegister T2 = xmm4;
@@ -620,7 +619,7 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R
// A2 to have bits 127-88 of all 8 blocks in 8 qwords
__ evmovdquq(T0, Address(input, 0), Assembler::AVX_512bit);
__ evmovdquq(T1, Address(input, 64), Assembler::AVX_512bit);
- poly1305_limbs_avx512(T0, T1, A0, A1, A2, true);
+ poly1305_limbs_avx512(T0, T1, A0, A1, A2, true, TMP1, TMP2, rscratch);
// Add accumulator to the first message block
__ vpaddq(A0, A0, C0, Assembler::AVX_512bit);
@@ -633,7 +632,7 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R
// A5 to have bits 127-88 of all 8 blocks in 8 qwords
__ evmovdquq(T0, Address(input, 64*2), Assembler::AVX_512bit);
__ evmovdquq(T1, Address(input, 64*3), Assembler::AVX_512bit);
- poly1305_limbs_avx512(T0, T1, A3, A4, A5, true);
+ poly1305_limbs_avx512(T0, T1, A3, A4, A5, true, TMP1, TMP2, rscratch);
__ subl(length, 16*16);
__ lea(input, Address(input,16*16));
@@ -681,7 +680,7 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R
// B1 to have bits 87-44 of all 4 blocks in alternating 8 qwords
// B2 to have bits 127-88 of all 4 blocks in alternating 8 qwords
__ vpxorq(T2, T2, T2, Assembler::AVX_512bit);
- poly1305_limbs_avx512(T0, T2, B0, B1, B2, false);
+ poly1305_limbs_avx512(T0, T2, B0, B1, B2, false, TMP1, TMP2, rscratch); 
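For each 128-bit lane, poly1305_limbs_avx512 performs the radix-2^44 split sketched below in scalar Java (illustrative only, not JDK code; lo and hi are the two 64-bit halves of one block):

public class Limbs44Demo {
    private static final long MASK44 = (1L << 44) - 1;

    // Split a 128-bit block hi:lo into 44/44/42-bit limbs
    // (the top limb carries 40 message bits plus the optional pad bit)
    static long[] split(long lo, long hi, boolean padMSG) {
        long l0 = lo & MASK44;                          // bits 0..43
        long l1 = ((lo >>> 44) | (hi << 20)) & MASK44;  // bits 44..87
        long l2 = hi >>> 24;                            // bits 88..127
        if (padMSG) {
            l2 |= 1L << 40;                             // 2^128 lands at bit 40 of the top limb
        }
        return new long[] { l0, l1, l2 };
    }

    public static void main(String[] args) {
        long[] l = split(0x0123456789abcdefL, 0xfedcba9876543210L, true);
        System.out.printf("%011x %011x %011x%n", l[2], l[1], l[0]);
    }
}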
// T1 contains the 2 highest bits of the powers of R __ vpsllq(T1, T1, 40, Assembler::AVX_512bit); @@ -773,12 +772,12 @@ void StubGenerator::poly1305_process_blocks_avx512(const Register input, const R // Load and interleave next block of data (128 bytes) __ evmovdquq(T0, Address(input, 0), Assembler::AVX_512bit); __ evmovdquq(T1, Address(input, 64), Assembler::AVX_512bit); - poly1305_limbs_avx512(T0, T1, B0, B1, B2, true); + poly1305_limbs_avx512(T0, T1, B0, B1, B2, true, TMP1, TMP2, rscratch); // Load and interleave next block of data (128 bytes) __ evmovdquq(T0, Address(input, 64*2), Assembler::AVX_512bit); __ evmovdquq(T1, Address(input, 64*3), Assembler::AVX_512bit); - poly1305_limbs_avx512(T0, T1, B3, B4, B5, true); + poly1305_limbs_avx512(T0, T1, B3, B4, B5, true, TMP1, TMP2, rscratch); poly1305_multiply8_avx512(A0, A1, A2, // MSG/ACC 16 blocks R0, R1, R2, R1P, R2P); // R^16..R^16, 4*5*R^16 @@ -948,12 +947,14 @@ address StubGenerator::generate_poly1305_processBlocks() { __ push(r14); __ push(r15); - // void processBlocks(byte[] input, int len, int[5] a, int[5] r) - const Register input = rdi; //input+offset + const Register input = rdi; const Register length = rbx; const Register accumulator = rcx; const Register R = r8; + // void processBlocks(byte[] input, int len, int[5] a, int[5] r) + // input, a, r pointers point at first array element + // java headers bypassed in LibraryCallKit::inline_poly1305_processBlocks #ifdef _WIN64 // c_rarg0 - rcx // c_rarg1 - rdx @@ -980,11 +981,13 @@ address StubGenerator::generate_poly1305_processBlocks() { const Register r0 = r11; // R constant bits 63..0 const Register r1 = r12; // R constant bits 127..64 const Register c1 = r8; // 5*R (upper limb only) + const Register t1 = r13; + const Register t2 = r14; Label L_process16Loop, L_process16LoopDone; // Load R into r1:r0 - poly1305_limbs(R, r0, r1, r1, true); + poly1305_limbs(R, r0, r1, noreg, true, t1, t2); // Compute 5*R (Upper limb only) __ movq(c1, r1); @@ -992,7 +995,7 @@ address StubGenerator::generate_poly1305_processBlocks() { __ addq(c1, r1); // c1 = r1 + (r1 >> 2) // Load accumulator into a2:a1:a0 - poly1305_limbs(accumulator, a0, a1, a2, false); + poly1305_limbs(accumulator, a0, a1, a2, false, t1, t2); // VECTOR LOOP: Minimum of 256 bytes to run vectorized code __ cmpl(length, 16*16); @@ -1018,7 +1021,7 @@ address StubGenerator::generate_poly1305_processBlocks() { __ bind(L_process16LoopDone); // Write output - poly1305_limbs_out(a0, a1, a2, accumulator); + poly1305_limbs_out(a0, a1, a2, accumulator, t1, t2); __ pop(r15); __ pop(r14); @@ -1033,4 +1036,4 @@ address StubGenerator::generate_poly1305_processBlocks() { __ leave(); __ ret(0); return start; -} \ No newline at end of file +} diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 821f7ffb1a55b..274d86f2ebc39 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -6973,6 +6973,9 @@ bool LibraryCallKit::inline_poly1305_processBlocks() { stubName = "poly1305_processBlocks"; if (!stubAddr) return false; + null_check_receiver(); // null-check receiver + if (stopped()) return true; + Node* input = argument(1); Node* input_offset = argument(2); Node* len = argument(3); From 58488f4296d9491d3c6b65793bf630f742140aa7 Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Tue, 15 Nov 2022 14:41:18 -0500 Subject: [PATCH 19/23] extra whitespace --- src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp index 9a81d93a3db03..41fe07095f508 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp @@ -376,7 +376,7 @@ void StubGenerator::poly1305_limbs_avx512( * a2 is optional. When only128 is set, limbs are expected to fit into 128-bits (i.e. a1:a0 such as clamped R) */ void StubGenerator::poly1305_limbs( - const Register limbs, const Register a0, const Register a1, + const Register limbs, const Register a0, const Register a1, const Register a2, bool only128, const Register t1, const Register t2) { @@ -422,7 +422,7 @@ void StubGenerator::poly1305_limbs( * Break 3×64-bit a2:a1:a0 limbs into 5×26-bit limbs and store out into 5 quadwords at address `limbs` */ void StubGenerator::poly1305_limbs_out( - const Register a0, const Register a1, const Register a2, + const Register a0, const Register a1, const Register a2, const Register limbs, const Register t1, const Register t2) { From cbf4938060c98b64fa8a0b9a08f52454994cc5e2 Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Tue, 15 Nov 2022 15:05:52 -0500 Subject: [PATCH 20/23] use noreg properly in poly1305_limbs --- src/hotspot/cpu/x86/stubGenerator_x86_64.hpp | 2 +- src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp index b315d2f91ef76..8bb868dbc85b5 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp @@ -396,7 +396,7 @@ class StubGenerator: public StubCodeGenerator { const Register R0, const Register R1, const Register C1, bool only128); void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P); - void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, bool only128, const Register t1, const Register t2); + void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, const Register t1, const Register t2); void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs, const Register t1, const Register t2); void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1, const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp index 41fe07095f508..16adcc75f4c9e 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp @@ -376,8 +376,7 @@ void StubGenerator::poly1305_limbs_avx512( * a2 is optional. When only128 is set, limbs are expected to fit into 128-bits (i.e. 
a1:a0 such as clamped R) */ void StubGenerator::poly1305_limbs( - const Register limbs, const Register a0, const Register a1, - const Register a2, bool only128, + const Register limbs, const Register a0, const Register a1, const Register a2, const Register t1, const Register t2) { __ movq(a0, Address(limbs, 0)); @@ -393,13 +392,13 @@ void StubGenerator::poly1305_limbs( __ addq(a0, t1); __ adcq(a1, t2); __ movq(t1, Address(limbs, 32)); - if (!only128) { + if (a2 != noreg) { __ movq(a2, t1); __ shrq(a2, 24); } __ shlq(t1, 40); __ addq(a1, t1); - if (only128) { + if (a2 == noreg) { return; } __ adcq(a2, 0); @@ -987,7 +986,7 @@ address StubGenerator::generate_poly1305_processBlocks() { Label L_process16Loop, L_process16LoopDone; // Load R into r1:r0 - poly1305_limbs(R, r0, r1, noreg, true, t1, t2); + poly1305_limbs(R, r0, r1, noreg, t1, t2); // Compute 5*R (Upper limb only) __ movq(c1, r1); @@ -995,7 +994,7 @@ address StubGenerator::generate_poly1305_processBlocks() { __ addq(c1, r1); // c1 = r1 + (r1 >> 2) // Load accumulator into a2:a1:a0 - poly1305_limbs(accumulator, a0, a1, a2, false, t1, t2); + poly1305_limbs(accumulator, a0, a1, a2, t1, t2); // VECTOR LOOP: Minimum of 256 bytes to run vectorized code __ cmpl(length, 16*16); From dbdfd1dc4d67035d0b5f0361ad2eba1c954921c1 Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Wed, 16 Nov 2022 15:46:23 -0500 Subject: [PATCH 21/23] redo register alloc with explicit func params --- src/hotspot/cpu/x86/stubGenerator_x86_64.hpp | 16 +- .../cpu/x86/stubGenerator_x86_64_poly.cpp | 624 +++++++++--------- 2 files changed, 336 insertions(+), 304 deletions(-) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp index 8bb868dbc85b5..5e97e1e9a4456 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64.hpp @@ -392,15 +392,19 @@ class StubGenerator: public StubCodeGenerator { void poly1305_process_blocks_avx512(const Register input, const Register length, const Register A0, const Register A1, const Register A2, const Register R0, const Register R1, const Register C1); - void poly1305_multiply_scalar(const Register A0, const Register A1, const Register A2, - const Register R0, const Register R1, const Register C1, bool only128); + void poly1305_multiply_scalar(const Register a0, const Register a1, const Register a2, + const Register r0, const Register r1, const Register c1, bool only128, + const Register t0, const Register t1, const Register t2, + const Register mulql, const Register mulqh); void poly1305_multiply8_avx512(const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, - const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P); - void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, const Register t1, const Register t2); - void poly1305_limbs_out(const Register a0, const Register a1, const Register a2, const Register limbs, const Register t1, const Register t2); + const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, + const XMMRegister P0L, const XMMRegister P0H, const XMMRegister P1L, const XMMRegister P1H, const XMMRegister P2L, const XMMRegister P2H, + const XMMRegister TMP, const Register rscratch); + void poly1305_limbs(const Register limbs, const Register a0, const Register a1, const Register a2, const Register t0, const Register t1); + void poly1305_limbs_out(const Register 
a0, const Register a1, const Register a2, const Register limbs, const Register t0, const Register t1); void poly1305_limbs_avx512(const XMMRegister D0, const XMMRegister D1, const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, - const XMMRegister TMP1, const XMMRegister TMP2, const Register rscratch); + const XMMRegister TMP, const Register rscratch); // BASE64 stubs diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp index 16adcc75f4c9e..ca97d8eb5b9b2 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp @@ -52,32 +52,6 @@ // upper-case variables are 8-element vector numbers in 3×44-bit limbs (in zmm registers) // [ ] used to denote vector numbers (with their elements) -// Register Map: -// GPRs: -// input = rdi -// length = rbx -// accumulator = rcx -// R = r8 -// a0 = rsi -// a1 = r9 -// a2 = r10 -// r0 = r11 -// r1 = r12 -// c1 = r8; -// t1 = r13 -// t2 = r14 -// t3 = r15 -// t0 = r14 -// rscratch = r13 -// stack(rsp, rbp) -// imul(rax, rdx) -// ZMMs: -// T: xmm0-6 -// C: xmm7-9 -// A: xmm13-18 -// B: xmm19-24 -// R: xmm25-29 - // Constant Pool: ATTRIBUTE_ALIGNED(64) uint64_t POLY1305_PAD_MSG[] = { 0x0000010000000000, 0x0000010000000000, @@ -148,86 +122,89 @@ static address poly1305_mask44() { // void StubGenerator::poly1305_multiply8_avx512( const XMMRegister A0, const XMMRegister A1, const XMMRegister A2, - const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P) + const XMMRegister R0, const XMMRegister R1, const XMMRegister R2, const XMMRegister R1P, const XMMRegister R2P, + const XMMRegister P0L, const XMMRegister P0H, const XMMRegister P1L, const XMMRegister P1H, const XMMRegister P2L, const XMMRegister P2H, + const XMMRegister TMP, const Register rscratch) { - const XMMRegister P0_L = xmm0; - const XMMRegister P0_H = xmm1; - const XMMRegister P1_L = xmm2; - const XMMRegister P1_H = xmm3; - const XMMRegister P2_L = xmm4; - const XMMRegister P2_H = xmm5; - const XMMRegister TMP1 = xmm6; - const Register rscratch = r13; // Reset partial sums - __ evpxorq(P0_L, P0_L, P0_L, Assembler::AVX_512bit); - __ evpxorq(P0_H, P0_H, P0_H, Assembler::AVX_512bit); - __ evpxorq(P1_L, P1_L, P1_L, Assembler::AVX_512bit); - __ evpxorq(P1_H, P1_H, P1_H, Assembler::AVX_512bit); - __ evpxorq(P2_L, P2_L, P2_L, Assembler::AVX_512bit); - __ evpxorq(P2_H, P2_H, P2_H, Assembler::AVX_512bit); + __ evpxorq(P0L, P0L, P0L, Assembler::AVX_512bit); + __ evpxorq(P0H, P0H, P0H, Assembler::AVX_512bit); + __ evpxorq(P1L, P1L, P1L, Assembler::AVX_512bit); + __ evpxorq(P1H, P1H, P1H, Assembler::AVX_512bit); + __ evpxorq(P2L, P2L, P2L, Assembler::AVX_512bit); + __ evpxorq(P2H, P2H, P2H, Assembler::AVX_512bit); // Calculate partial products - __ evpmadd52luq(P0_L, A2, R1P, Assembler::AVX_512bit); - __ evpmadd52huq(P0_H, A2, R1P, Assembler::AVX_512bit); - __ evpmadd52luq(P1_L, A2, R2P, Assembler::AVX_512bit); - __ evpmadd52huq(P1_H, A2, R2P, Assembler::AVX_512bit); - __ evpmadd52luq(P2_L, A2, R0, Assembler::AVX_512bit); - __ evpmadd52huq(P2_H, A2, R0, Assembler::AVX_512bit); - - __ evpmadd52luq(P1_L, A0, R1, Assembler::AVX_512bit); - __ evpmadd52huq(P1_H, A0, R1, Assembler::AVX_512bit); - __ evpmadd52luq(P2_L, A0, R2, Assembler::AVX_512bit); - __ evpmadd52huq(P2_H, A0, R2, Assembler::AVX_512bit); - __ evpmadd52luq(P0_L, A0, R0, Assembler::AVX_512bit); - __ evpmadd52huq(P0_H, A0, R0, Assembler::AVX_512bit); - - __ 
evpmadd52luq(P0_L, A1, R2P, Assembler::AVX_512bit); - __ evpmadd52huq(P0_H, A1, R2P, Assembler::AVX_512bit); - __ evpmadd52luq(P1_L, A1, R0, Assembler::AVX_512bit); - __ evpmadd52huq(P1_H, A1, R0, Assembler::AVX_512bit); - __ evpmadd52luq(P2_L, A1, R1, Assembler::AVX_512bit); - __ evpmadd52huq(P2_H, A1, R1, Assembler::AVX_512bit); + // p0 = a2×r1' + // p1 = a2×r2' + // p2 = a2×r0 + __ evpmadd52luq(P0L, A2, R1P, Assembler::AVX_512bit); + __ evpmadd52huq(P0H, A2, R1P, Assembler::AVX_512bit); + __ evpmadd52luq(P1L, A2, R2P, Assembler::AVX_512bit); + __ evpmadd52huq(P1H, A2, R2P, Assembler::AVX_512bit); + __ evpmadd52luq(P2L, A2, R0, Assembler::AVX_512bit); + __ evpmadd52huq(P2H, A2, R0, Assembler::AVX_512bit); + + // p0 += a0×r0 + // p1 += a0×r1 + // p2 += a0×r2 + __ evpmadd52luq(P1L, A0, R1, Assembler::AVX_512bit); + __ evpmadd52huq(P1H, A0, R1, Assembler::AVX_512bit); + __ evpmadd52luq(P2L, A0, R2, Assembler::AVX_512bit); + __ evpmadd52huq(P2H, A0, R2, Assembler::AVX_512bit); + __ evpmadd52luq(P0L, A0, R0, Assembler::AVX_512bit); + __ evpmadd52huq(P0H, A0, R0, Assembler::AVX_512bit); + + // p0 += a1×r2' + // p1 += a1×r0 + // p2 += a1×r1 + __ evpmadd52luq(P0L, A1, R2P, Assembler::AVX_512bit); + __ evpmadd52huq(P0H, A1, R2P, Assembler::AVX_512bit); + __ evpmadd52luq(P1L, A1, R0, Assembler::AVX_512bit); + __ evpmadd52huq(P1H, A1, R0, Assembler::AVX_512bit); + __ evpmadd52luq(P2L, A1, R1, Assembler::AVX_512bit); + __ evpmadd52huq(P2H, A1, R1, Assembler::AVX_512bit); // Carry propagation: - // (Not quite aligned) | More mathematically correct: - // P2_L P1_L P0_L | P2_L×2^88 + P1_L×2^44 + P0_L×2^0 - // + P2_H P1_H P0_H | + P2_H×2^140 + P1_H×2^96 + P0_H×2^52 - // --------------------------- | ----------------------------------------------- - // = P2_H A2 A1 A0 | = P2_H×2^130 + A2×2^88 + A1×2^44 + A0×2^0 + // (Not quite aligned) | More mathematically correct: + // P2L P1L P0L | P2L×2^88 + P1L×2^44 + P0L×2^0 + // + P2H P1H P0H | + P2H×2^140 + P1H×2^96 + P0H×2^52 + // --------------------------- | ----------------------------------------------- + // = P2H A2 A1 A0 | = P2H×2^130 + A2×2^88 + A1×2^44 + A0×2^0 // - __ vpsrlq(TMP1, P0_L, 44, Assembler::AVX_512bit); - __ evpandq(A0, P0_L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits + __ vpsrlq(TMP, P0L, 44, Assembler::AVX_512bit); + __ evpandq(A0, P0L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits - __ vpsllq(P0_H, P0_H, 8, Assembler::AVX_512bit); - __ vpaddq(P0_H, P0_H, TMP1, Assembler::AVX_512bit); - __ vpaddq(P1_L, P1_L, P0_H, Assembler::AVX_512bit); - __ evpandq(A1, P1_L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits + __ vpsllq(P0H, P0H, 8, Assembler::AVX_512bit); + __ vpaddq(P0H, P0H, TMP, Assembler::AVX_512bit); + __ vpaddq(P1L, P1L, P0H, Assembler::AVX_512bit); + __ evpandq(A1, P1L, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits - __ vpsrlq(TMP1, P1_L, 44, Assembler::AVX_512bit); - __ vpsllq(P1_H, P1_H, 8, Assembler::AVX_512bit); - __ vpaddq(P1_H, P1_H, TMP1, Assembler::AVX_512bit); - __ vpaddq(P2_L, P2_L, P1_H, Assembler::AVX_512bit); - __ evpandq(A2, P2_L, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, rscratch); // Clear top 22 bits + __ vpsrlq(TMP, P1L, 44, Assembler::AVX_512bit); + __ vpsllq(P1H, P1H, 8, Assembler::AVX_512bit); + __ vpaddq(P1H, P1H, TMP, Assembler::AVX_512bit); + __ vpaddq(P2L, P2L, P1H, Assembler::AVX_512bit); + __ evpandq(A2, P2L, 
ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, rscratch); // Clear top 22 bits - __ vpsrlq(TMP1, P2_L, 42, Assembler::AVX_512bit); - __ vpsllq(P2_H, P2_H, 10, Assembler::AVX_512bit); - __ vpaddq(P2_H, P2_H, TMP1, Assembler::AVX_512bit); + __ vpsrlq(TMP, P2L, 42, Assembler::AVX_512bit); + __ vpsllq(P2H, P2H, 10, Assembler::AVX_512bit); + __ vpaddq(P2H, P2H, TMP, Assembler::AVX_512bit); // Reduction: p2->a0->a1 // Multiply by 5 the highest bits (p2 is above 130 bits) - __ vpaddq(A0, A0, P2_H, Assembler::AVX_512bit); - __ vpsllq(P2_H, P2_H, 2, Assembler::AVX_512bit); - __ vpaddq(A0, A0, P2_H, Assembler::AVX_512bit); - __ vpsrlq(TMP1, A0, 44, Assembler::AVX_512bit); + __ vpaddq(A0, A0, P2H, Assembler::AVX_512bit); + __ vpsllq(P2H, P2H, 2, Assembler::AVX_512bit); + __ vpaddq(A0, A0, P2H, Assembler::AVX_512bit); + __ vpsrlq(TMP, A0, 44, Assembler::AVX_512bit); __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); - __ vpaddq(A1, A1, TMP1, Assembler::AVX_512bit); + __ vpaddq(A1, A1, TMP, Assembler::AVX_512bit); } // Compute product for a single 16-byte message block // - Assumes that r = [r1 r0] is only 128 bits (not 130) -// - When only128 is set, Input [a2 a1 a0] is 128 bits (i.e. a2==0) -// - Output [a2 a1 a0] is at least 130 bits (i.e. a2 is used) +// - Input [a2 a1 a0]; when only128 is set, input is 128 bits (i.e. a2==0) +// - Output [a2 a1 a0] is at least 130 bits (i.e. a2 is used regardless of only128) // // Note 1: a2 here is only two bits so anything above is subject to reduction. // Note 2: Constant c1 = 5*r1/4 = r1 + (r1 >> 2) (the clamped r1 has its low two bits clear) simplifies the multiply, using fewer operations @@ -243,89 +220,88 @@ void StubGenerator::poly1305_multiply8_avx512( // ----------------------------- // [0|L2L] [L1H|L1L] [L0H|L0L] // -// Registers: t3:t2 t1:a0 +// Registers: t2:t1 t0:a0 // // Completing the multiply and adding (with carry) 3x128-bit limbs into // 192-bits again (3x64-bits): // a0 = L0L // a1 = L0H + L1L -// t3 = L1H + L2L +// t2 = L1H + L2L void StubGenerator::poly1305_multiply_scalar( const Register a0, const Register a1, const Register a2, - const Register r0, const Register r1, const Register c1, bool only128) + const Register r0, const Register r1, const Register c1, bool only128, + const Register t0, const Register t1, const Register t2, + const Register mulql, const Register mulqh) { - const Register t1 = r13; - const Register t2 = r14; - const Register t3 = r15; - // Note mulq instruction requires/clobers rax, rdx + // mulq instruction requires/clobbers rax, rdx (mulql, mulqh) - // t3:t2 = (a0 * r1) + // t2:t1 = (a0 * r1) __ movq(rax, r1); __ mulq(a0); - __ movq(t2, rax); - __ movq(t3, rdx); + __ movq(t1, rax); + __ movq(t2, rdx); - // t1:a0 = (a0 * r0) + // t0:a0 = (a0 * r0) __ movq(rax, r0); __ mulq(a0); __ movq(a0, rax); // a0 not used in other operations - __ movq(t1, rdx); + __ movq(t0, rdx); - // t3:t2 += (a1 * r0) + // t2:t1 += (a1 * r0) __ movq(rax, r0); __ mulq(a1); - __ addq(t2, rax); - __ adcq(t3, rdx); + __ addq(t1, rax); + __ adcq(t2, rdx); - // t1:a0 += (a1 * r1x5) + // t0:a0 += (a1 * r1x5) __ movq(rax, c1); __ mulq(a1); __ addq(a0, rax); - __ adcq(t1, rdx); + __ adcq(t0, rdx); // Note: a2 is clamped to 2-bits, // r1/r0 is clamped to 60-bits, // their product is less than 2^64.
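// Equivalently, the four mulq blocks above compute, in 128-bit arithmetic (a sketch only, assuming a compiler with unsigned __int128 such as GCC/Clang; this is not emitted code, and the bracketed term belongs to the !only128 path): //   t0:a0 = (u128)a0*r0 + (u128)a1*c1                  // a1*r1 carries weight 2^128 = 2^130/4; 2^130 folds to 5, so it becomes a1*(5*r1/4) = a1*c1 //   t2:t1 = (u128)a0*r1 + (u128)a1*r0 [+ (u128)a2*c1]  // a2*r1 (weight 2^192) folds the same way, landing as a2*c1 at weight 2^64 // The remaining a2*r0 term is added into t2 in the else branch below. if (only128) { // Accumulator only 128 bits, i.e. 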
a2 == 0 - // just move and add t1-t2 to a1 - __ movq(a1, t1); - __ addq(a1, t2); - __ adcq(t3, 0); + // just move and add t0-t1 to a1 + __ movq(a1, t0); + __ addq(a1, t1); + __ adcq(t2, 0); } else { - // t3:t2 += (a2 * r1x5) + // t2:t1 += (a2 * r1x5) __ movq(a1, a2); // use a1 for a2 __ imulq(a1, c1); - __ addq(t2, a1); - __ adcq(t3, 0); + __ addq(t1, a1); + __ adcq(t2, 0); - __ movq(a1, t1); // t1:a0 => a1:a0 + __ movq(a1, t0); // t0:a0 => a1:a0 - // t3:a1 += (a2 * r0):t2 + // t2:a1 += (a2 * r0):t1 __ imulq(a2, r0); - __ addq(a1, t2); - __ adcq(t3, a2); + __ addq(a1, t1); + __ adcq(t2, a2); } - // At this point, 3 64-bit limbs are in t3:a1:a0 - // t3 can span over more than 2 bits so final partial reduction step is needed. + // At this point, 3 64-bit limbs are in t2:a1:a0 + // t2 can span over more than 2 bits so final partial reduction step is needed. // // Partial reduction (just to fit into 130 bits) - // a2 = t3 & 3 - // k = (t3 & ~3) + (t3 >> 2) + // a2 = t2 & 3 + // k = (t2 & ~3) + (t2 >> 2) // Y x4 + Y x1 // a2:a1:a0 += k // // Result will be in a2:a1:a0 - __ movq(t1, t3); - __ movl(a2, t3); // DWORD - __ andq(t1, ~3); - __ shrq(t3, 2); - __ addq(t1, t3); + __ movq(t0, t2); + __ movl(a2, t2); // DWORD + __ andq(t0, ~3); + __ shrq(t2, 2); + __ addq(t0, t2); __ andl(a2, 3); // DWORD - // a2:a1:a0 += k (kept in t1) - __ addq(a0, t1); + // a2:a1:a0 += k (kept in t0) + __ addq(a0, t0); __ adcq(a1, 0); __ adcl(a2, 0); // DWORD } @@ -349,22 +325,22 @@ void StubGenerator::poly1305_multiply_scalar( void StubGenerator::poly1305_limbs_avx512( const XMMRegister D0, const XMMRegister D1, const XMMRegister L0, const XMMRegister L1, const XMMRegister L2, bool padMSG, - const XMMRegister TMP1, const XMMRegister TMP2, const Register rscratch) + const XMMRegister TMP, const Register rscratch) { // Interleave blocks of data - __ evpunpckhqdq(TMP1, D0, D1, Assembler::AVX_512bit); + __ evpunpckhqdq(TMP, D0, D1, Assembler::AVX_512bit); __ evpunpcklqdq(L0, D0, D1, Assembler::AVX_512bit); // Highest 42-bit limbs of new blocks - __ vpsrlq(L2, TMP1, 24, Assembler::AVX_512bit); + __ vpsrlq(L2, TMP, 24, Assembler::AVX_512bit); if (padMSG) { __ evporq(L2, L2, ExternalAddress(poly1305_pad_msg()), Assembler::AVX_512bit, rscratch); // Add 2^128 to all 8 final qwords of the message } // Middle 44-bit limbs of new blocks __ vpsrlq(L1, L0, 44, Assembler::AVX_512bit); - __ vpsllq(TMP2, TMP1, 20, Assembler::AVX_512bit); - __ vpternlogq(L1, 0xA8, TMP2, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // (A OR B AND C) + __ vpsllq(TMP, TMP, 20, Assembler::AVX_512bit); + __ vpternlogq(L1, 0xA8, TMP, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // ((A OR B) AND C) // Lowest 44-bit limbs of new blocks __ evpandq(L0, L0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); @@ -377,27 +353,27 @@ void StubGenerator::poly1305_limbs_avx512( */ void StubGenerator::poly1305_limbs( const Register limbs, const Register a0, const Register a1, const Register a2, - const Register t1, const Register t2) + const Register t0, const Register t1) { __ movq(a0, Address(limbs, 0)); - __ movq(t1, Address(limbs, 8)); - __ shlq(t1, 26); - __ addq(a0, t1); - __ movq(t1, Address(limbs, 16)); - __ movq(t2, Address(limbs, 24)); - __ movq(a1, t1); - __ shlq(t1, 52); + __ movq(t0, Address(limbs, 8)); + __ shlq(t0, 26); + __ addq(a0, t0); + __ movq(t0, Address(limbs, 16)); + __ movq(t1, Address(limbs, 24)); + __ movq(a1, t0); + __ shlq(t0, 52); __ shrq(a1, 12); - __ shlq(t2, 14); - __ 
addq(a0, t1); - __ adcq(a1, t2); - __ movq(t1, Address(limbs, 32)); + __ shlq(t1, 14); + __ addq(a0, t0); + __ adcq(a1, t1); + __ movq(t0, Address(limbs, 32)); if (a2 != noreg) { - __ movq(a2, t1); + __ movq(a2, t0); __ shrq(a2, 24); } - __ shlq(t1, 40); - __ addq(a1, t1); + __ shlq(t0, 40); + __ addq(a1, t0); if (a2 == noreg) { return; } @@ -405,14 +381,14 @@ void StubGenerator::poly1305_limbs( // One round of reduction // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0 - __ movq(t1, a2); - __ andq(t1, ~3); + __ movq(t0, a2); + __ andq(t0, ~3); __ andq(a2, 3); - __ movq(t2, t1); - __ shrq(t2, 2); - __ addq(t1, t2); + __ movq(t1, t0); + __ shrq(t1, 2); + __ addq(t0, t1); - __ addq(a0, t1); + __ addq(a0, t0); __ adcq(a1, 0); __ adcq(a2, 0); } @@ -423,50 +399,50 @@ void StubGenerator::poly1305_limbs( void StubGenerator::poly1305_limbs_out( const Register a0, const Register a1, const Register a2, const Register limbs, - const Register t1, const Register t2) + const Register t0, const Register t1) { // Extra round of reduction // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0 - __ movq(t1, a2); - __ andq(t1, ~3); + __ movq(t0, a2); + __ andq(t0, ~3); __ andq(a2, 3); - __ movq(t2, t1); - __ shrq(t2, 2); - __ addq(t1, t2); + __ movq(t1, t0); + __ shrq(t1, 2); + __ addq(t0, t1); - __ addq(a0, t1); + __ addq(a0, t0); __ adcq(a1, 0); __ adcq(a2, 0); // Chop a2:a1:a0 into 26-bit limbs - __ movl(t1, a0); - __ andl(t1, 0x3ffffff); - __ movq(Address(limbs, 0), t1); + __ movl(t0, a0); + __ andl(t0, 0x3ffffff); + __ movq(Address(limbs, 0), t0); __ shrq(a0, 26); - __ movl(t1, a0); - __ andl(t1, 0x3ffffff); - __ movq(Address(limbs, 8), t1); + __ movl(t0, a0); + __ andl(t0, 0x3ffffff); + __ movq(Address(limbs, 8), t0); __ shrq(a0, 26); // 12 bits left in a0, concatenate 14 from a1 - __ movl(t1, a1); - __ shll(t1, 12); - __ addl(t1, a0); - __ andl(t1, 0x3ffffff); - __ movq(Address(limbs, 16), t1); + __ movl(t0, a1); + __ shll(t0, 12); + __ addl(t0, a0); + __ andl(t0, 0x3ffffff); + __ movq(Address(limbs, 16), t0); __ shrq(a1, 14); // already used up 14 bits __ shlq(a2, 50); // a2 contains 2 bits when reduced, but $Element.limbs don't have to be fully reduced __ addq(a1, a2); // put remaining bits into a1 - __ movl(t1, a1); - __ andl(t1, 0x3ffffff); - __ movq(Address(limbs, 24), t1); + __ movl(t0, a1); + __ andl(t0, 0x3ffffff); + __ movq(Address(limbs, 24), t0); __ shrq(a1, 26); - __ movl(t1, a1); - //andl(t1, 0x3ffffff); doesnt have to be fully reduced, leave remaining bit(s) - __ movq(Address(limbs, 32), t1); + __ movl(t0, a1); + //andl(t0, 0x3ffffff); doesn't have to be fully reduced, leave remaining bit(s) + __ movq(Address(limbs, 32), t0); } // This function consumes as many whole 16*16-byte blocks as available in input @@ -474,12 +450,13 @@ void StubGenerator::poly1305_limbs_out( // and [a2 a1 a0] will contain the current accumulator value // // Math Note: -// Put simply, main loop in this function multiplies each message block by r^16; why this works? 'Math' happens before and after.. why as follows: +// The main loop in this function multiplies each message block by r^16; glue code before and after the loop makes this work. +// Proof sketch (for brevity, split into 4 'rows' instead of 16): // // hash = ((((m1*r + m2)*r + m3)*r ... mn)*r // = m1*r^n + m2*r^(n-1) + ... +mn_1*r^2 + mn*r // Horner's rule // -// = m1*r^n + m4*r^(n-4) + m8*r^(n-8) ... // split into 4 groups for brevity, same applies to 16 +// = m1*r^n + m5*r^(n-4) + m9*r^(n-8) ... 
// split into 4 groups for brevity, same applies to 16 blocks // + m2*r^(n-1) + m6*r^(n-5) + m10*r^(n-9) ... // + m3*r^(n-2) + m7*r^(n-6) + m11*r^(n-10) ... // + m4*r^(n-3) + m8*r^(n-7) + m12*r^(n-11) ... @@ -487,24 +464,24 @@ void StubGenerator::poly1305_limbs_out( // = r^4 * (m1*r^(n-4) + m5*r^(n-8) + m9 *r^(n-12) ... + mn_3) // factor out r^4..r; same applies to 16 but r^16..r factors // + r^3 * (m2*r^(n-4) + m6*r^(n-8) + m10*r^(n-12) ... + mn_2) // + r^2 * (m3*r^(n-4) + m7*r^(n-8) + m11*r^(n-12) ... + mn_1) -// + r^1 * (m4*r^(n-4) + m7*r^(n-8) + m11*r^(n-16) ... + mn_0) // Note last message group has no multiplier +// + r^1 * (m4*r^(n-4) + m8*r^(n-8) + m12*r^(n-12) ... + mn_0) // Note: the last column (message group) has no multiplier // -// = r^4 * (((m1*r^4 + m4)*r^4 + m8 )*r^4 ... + mn_3) // reverse Horner's rule, for each group -// + r^3 * (((m2*r^4 + m5)*r^4 + m9 )*r^4 ... + mn_2) -// + r^2 * (((m3*r^4 + m6)*r^4 + m10)*r^4 ... + mn_1) -// + r^1 * (((m4*r^4 + m7)*r^4 + m11)*r^4 ... + mn_0) +// = (((m1*r^4 + m5)*r^4 + m9 )*r^4 ... + mn_3) * r^4 // reverse Horner's rule, for each group +// + (((m2*r^4 + m6)*r^4 + m10)*r^4 ... + mn_2) * r^3 // each column is multiplied by r^4, except last +// + (((m3*r^4 + m7)*r^4 + m11)*r^4 ... + mn_1) * r^2 +// + (((m4*r^4 + m8)*r^4 + m12)*r^4 ... + mn_0) * r^1 // // Also see M. Goll and S. Gueron, "Vectorization of Poly1305 Message Authentication Code" // -// Pseudocode for this function: +// Pseudocode: // * used for poly1305_multiply_scalar // × used for poly1305_multiply8_avx512 // lower-case variables are scalar numbers in 3×44-bit limbs (in gprs) // upper-case variables are 8&16-element vector numbers in 3×44-bit limbs (in zmm registers) // // C = a // [0 0 0 0 0 0 0 a] -// AL = limbs(input) -// AH = limbs(input+8) +// AL = poly1305_limbs_avx512(input) +// AH = poly1305_limbs_avx512(input+8) // AL = AL + C // input+=16, length-=16 // @@ -529,8 +506,8 @@ void StubGenerator::poly1305_limbs_out( // R = r^16 || r^16 || .. 
// [r^16 r^16 r^16 r^16 r^16 r^16 r^16 r^16] // // for (;length>=16; input+=16, length-=16) -// BL = limbs(input) -// BH = limbs(input+8) +// BL = poly1305_limbs_avx512(input) +// BH = poly1305_limbs_avx512(input+8) // AL = AL × R // AH = AH × R // AL = AL + BL // AH = AH + BH // // B = pop() // R = pop() // AL = AL × R // AH = AH × B // A = AL + AH // 16->8 blocks // T = A >> 4 // 8 ->4 blocks // A = A + T // T = A >> 2 // 4 ->2 blocks // A = A + T // T = A >> 1 // 2 ->1 blocks // A = A + T // a = A +// +// Register Map: +// GPRs: +// input = rdi +// length = rbx +// accumulator = rcx +// R = r8 +// a0 = rsi +// a1 = r9 +// a2 = r10 +// r0 = r11 +// r1 = r12 +// c1 = r8; +// t0 = r13 +// t1 = r14 +// t2 = r15 +// rscratch = r13 +// stack(rsp, rbp) +// mulq(rax, rdx) in poly1305_multiply_scalar +// +// ZMMs: +// TMP: xmm6 +// C: xmm7-9 +// D: xmm2-4 +// T: xmm0-5 +// A: xmm13-18 +// B: xmm19-24 +// R: xmm25-29 void StubGenerator::poly1305_process_blocks_avx512( const Register input, const Register length, const Register a0, const Register a1, const Register a2, const Register r0, const Register r1, const Register c1) { Label L_process256Loop, L_process256LoopDone; - // Register Map: - // reserved: rsp, rbp, rcx - // PARAMs: rdi, rbx, rsi, r8-r12 - // poly1305_multiply_scalar clobbers: r13-r15, rax, rdx - const Register t0 = r14; - const Register t1 = r13; + const Register t0 = r13; + const Register t1 = r14; + const Register t2 = r15; const Register rscratch = r13; + const Register mulql = rax; + const Register mulqh = rdx; - // poly1305_multiply8_avx512 clobbers: xmm0-xmm6 - const XMMRegister TMP1 = xmm0; - const XMMRegister TMP2 = xmm1; + const XMMRegister TMP = xmm6; - const XMMRegister T0 = xmm2; - const XMMRegister T1 = xmm3; - const XMMRegister T2 = xmm4; + const XMMRegister D0 = xmm2; + const XMMRegister D1 = xmm3; + const XMMRegister D2 = xmm4; const XMMRegister C0 = xmm7; const XMMRegister C1 = xmm8; const XMMRegister C2 = xmm9; + const XMMRegister T0 = xmm0; + const XMMRegister T1 = xmm1; + const XMMRegister T2 = xmm2; + const XMMRegister T3 = xmm3; + const XMMRegister T4 = xmm4; + const XMMRegister T5 = xmm5; + const XMMRegister A0 = xmm13; const XMMRegister A1 = xmm14; const XMMRegister A2 = xmm15; @@ -616,9 +625,9 @@ void StubGenerator::poly1305_process_blocks_avx512( // A0 to have bits 0-43 of all 8 blocks in 8 qwords // A1 to have bits 87-44 of all 8 blocks in 8 qwords // A2 to have bits 127-88 of all 8 blocks in 8 qwords - __ evmovdquq(T0, Address(input, 0), Assembler::AVX_512bit); - __ evmovdquq(T1, Address(input, 64), Assembler::AVX_512bit); - poly1305_limbs_avx512(T0, T1, A0, A1, A2, true, TMP1, TMP2, rscratch); + __ evmovdquq(D0, Address(input, 0), Assembler::AVX_512bit); + __ evmovdquq(D1, Address(input, 64), Assembler::AVX_512bit); + poly1305_limbs_avx512(D0, D1, A0, A1, A2, true, TMP, rscratch); // Add accumulator to the first message block __ vpaddq(A0, A0, C0, Assembler::AVX_512bit); @@ -629,61 +638,67 @@ void StubGenerator::poly1305_process_blocks_avx512( // A3 to have bits 0-43 of all 8 blocks in 8 qwords // A4 to have bits 87-44 of all 8 blocks in 8 qwords // A5 to have bits 127-88 of all 8 blocks in 8 qwords - __ evmovdquq(T0, Address(input, 64*2), Assembler::AVX_512bit); - __ evmovdquq(T1, Address(input, 64*3), Assembler::AVX_512bit); - poly1305_limbs_avx512(T0, T1, A3, A4, A5, true, TMP1, TMP2, rscratch); + __ evmovdquq(D0, Address(input, 64*2), Assembler::AVX_512bit); + __ evmovdquq(D1, Address(input, 64*3), Assembler::AVX_512bit); + poly1305_limbs_avx512(D0, D1, A3, A4, A5, true, TMP, rscratch); __ subl(length, 16*16); __ lea(input, Address(input,16*16));
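// For each 16-byte block (n1:n0, read little-endian), poly1305_limbs_avx512 above performed, per qword lane (a scalar sketch only, assuming MASK44 == (1ULL << 44) - 1, i.e. the poly1305_mask44() constant; this is not emitted code): //   L0 = n0 & MASK44;                         // bits 0..43 //   L1 = ((n0 >> 44) | (n1 << 20)) & MASK44;  // bits 44..87 //   L2 = (n1 >> 24) | (1ULL << 40);           // bits 88..127 plus the 2^128 pad bit // The pad bit is 2^40 because L2 carries weight 2^88 (88 + 40 == 128), matching the POLY1305_PAD_MSG constant. // Compute the powers of 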
R^1..R^4 and form 44-bit limbs of each - // T0 to have bits 0-127 in 4 quadword pairs - // T1 to have bits 128-129 in alternating 8 qwords - __ vpxorq(T1, T1, T1, Assembler::AVX_512bit); - __ movq(T2, r0); - __ vpinsrq(T2, T2, r1, 1); - __ vinserti32x4(T0, T0, T2, 3); + // D0 to have bits 0-127 in 4 quadword pairs + // D1 to have bits 128-129 in alternating 8 qwords + __ vpxorq(D1, D1, D1, Assembler::AVX_512bit); + __ movq(D2, r0); + __ vpinsrq(D2, D2, r1, 1); + __ vinserti32x4(D0, D0, D2, 3); // Calculate R^2 __ movq(a0, r0); __ movq(a1, r1); // "Clever": a2 not set because poly1305_multiply_scalar has a flag to indicate 128-bit accumulator - poly1305_multiply_scalar(a0, a1, a2, r0, r1, c1, true); + poly1305_multiply_scalar(a0, a1, a2, + r0, r1, c1, true, + t0, t1, t2, mulql, mulqh); - __ movq(T2, a0); - __ vpinsrq(T2, T2, a1, 1); - __ vinserti32x4(T0, T0, T2, 2); - __ movq(T2, a2); - __ vinserti32x4(T1, T1, T2, 2); + __ movq(D2, a0); + __ vpinsrq(D2, D2, a1, 1); + __ vinserti32x4(D0, D0, D2, 2); + __ movq(D2, a2); + __ vinserti32x4(D1, D1, D2, 2); // Calculate R^3 - poly1305_multiply_scalar(a0, a1, a2, r0, r1, c1, false); + poly1305_multiply_scalar(a0, a1, a2, + r0, r1, c1, false, + t0, t1, t2, mulql, mulqh); - __ movq(T2, a0); - __ vpinsrq(T2, T2, a1, 1); - __ vinserti32x4(T0, T0, T2, 1); - __ movq(T2, a2); - __ vinserti32x4(T1, T1, T2, 1); + __ movq(D2, a0); + __ vpinsrq(D2, D2, a1, 1); + __ vinserti32x4(D0, D0, D2, 1); + __ movq(D2, a2); + __ vinserti32x4(D1, D1, D2, 1); // Calculate R^4 - poly1305_multiply_scalar(a0, a1, a2, r0, r1, c1, false); + poly1305_multiply_scalar(a0, a1, a2, + r0, r1, c1, false, + t0, t1, t2, mulql, mulqh); - __ movq(T2, a0); - __ vpinsrq(T2, T2, a1, 1); - __ vinserti32x4(T0, T0, T2, 0); - __ movq(T2, a2); - __ vinserti32x4(T1, T1, T2, 0); + __ movq(D2, a0); + __ vpinsrq(D2, D2, a1, 1); + __ vinserti32x4(D0, D0, D2, 0); + __ movq(D2, a2); + __ vinserti32x4(D1, D1, D2, 0); // Interleave the powers of R^1..R^4 to form 44-bit limbs (half-empty) // B0 to have bits 0-43 of all 4 blocks in alternating 8 qwords // B1 to have bits 87-44 of all 4 blocks in alternating 8 qwords // B2 to have bits 127-88 of all 4 blocks in alternating 8 qwords - __ vpxorq(T2, T2, T2, Assembler::AVX_512bit); - poly1305_limbs_avx512(T0, T2, B0, B1, B2, false, TMP1, TMP2, rscratch); + __ vpxorq(D2, D2, D2, Assembler::AVX_512bit); + poly1305_limbs_avx512(D0, D2, B0, B1, B2, false, TMP, rscratch); - // T1 contains the 2 highest bits of the powers of R - __ vpsllq(T1, T1, 40, Assembler::AVX_512bit); - __ evporq(B2, B2, T1, Assembler::AVX_512bit); + // D1 contains the 2 highest bits of the powers of R + __ vpsllq(D1, D1, 40, Assembler::AVX_512bit); + __ evporq(B2, B2, D1, Assembler::AVX_512bit); // Broadcast 44-bit limbs of R^4 into R0,R1,R2 __ mov(t0, a0); @@ -716,7 +731,8 @@ void StubGenerator::poly1305_process_blocks_avx512( // Calculate R^8-R^5 poly1305_multiply8_avx512(B0, B1, B2, // ACC=R^4..R^1 - R0, R1, R2, R1P, R2P); // R^4..R^4, 4*5*R^4 + R0, R1, R2, R1P, R2P, // R^4..R^4, 4*5*R^4 + T0, T1, T2, T3, T4, T5, TMP, rscratch); // Interleave powers of R: R^8 R^4 R^7 R^3 R^6 R^2 R^5 R __ evporq(B0, B0, C0, Assembler::AVX_512bit); @@ -743,7 +759,8 @@ void StubGenerator::poly1305_process_blocks_avx512( // Calculate R^16-R^9 poly1305_multiply8_avx512(B0, B1, B2, // ACC=R^8..R^1 - R0, R1, R2, R1P, R2P); // R^8..R^8, 4*5*R^8 + R0, R1, R2, R1P, R2P, // R^8..R^8, 4*5*R^8 + T0, T1, T2, T3, T4, T5, TMP, rscratch); // Store R^16-R^9 for later use __ evmovdquq(Address(rsp, 64*3), B0, 
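Assembler::AVX_512bit); // Why R1P/R2P hold 4*5*R rather than 5*R: with 44-bit limbs, a partial product Ai*Rj carries weight 2^(44*(i+j)), and any term with i+j >= 3 wraps at 2^132 = 4*2^130, where 2^130 folds to 5 (mod 2^130 - 5). Per lane this amounts to (a sketch only, not emitted code): //   p0 += a1*(20*r2) + a2*(20*r1)  // the i+j == 3 terms fold back to weight 2^0 //   p1 += a2*(20*r2)               // the i+j == 4 term folds to weight 2^44 // which is exactly what feeding R1P = 4*5*R1 and R2P = 4*5*R2 into evpmadd52luq/evpmadd52huq implements in poly1305_multiply8_avx512().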
@@ -769,19 +786,21 @@ void StubGenerator::poly1305_process_blocks_avx512( __ jcc(Assembler::less, L_process256LoopDone); // Load and interleave next block of data (128 bytes) - __ evmovdquq(T0, Address(input, 0), Assembler::AVX_512bit); - __ evmovdquq(T1, Address(input, 64), Assembler::AVX_512bit); - poly1305_limbs_avx512(T0, T1, B0, B1, B2, true, TMP1, TMP2, rscratch); + __ evmovdquq(D0, Address(input, 0), Assembler::AVX_512bit); + __ evmovdquq(D1, Address(input, 64), Assembler::AVX_512bit); + poly1305_limbs_avx512(D0, D1, B0, B1, B2, true, TMP, rscratch); // Load and interleave next block of data (128 bytes) - __ evmovdquq(T0, Address(input, 64*2), Assembler::AVX_512bit); - __ evmovdquq(T1, Address(input, 64*3), Assembler::AVX_512bit); - poly1305_limbs_avx512(T0, T1, B3, B4, B5, true, TMP1, TMP2, rscratch); + __ evmovdquq(D0, Address(input, 64*2), Assembler::AVX_512bit); + __ evmovdquq(D1, Address(input, 64*3), Assembler::AVX_512bit); + poly1305_limbs_avx512(D0, D1, B3, B4, B5, true, TMP, rscratch); poly1305_multiply8_avx512(A0, A1, A2, // MSG/ACC 16 blocks - R0, R1, R2, R1P, R2P); // R^16..R^16, 4*5*R^16 + R0, R1, R2, R1P, R2P, // R^16..R^16, 4*5*R^16 + T0, T1, T2, T3, T4, T5, TMP, rscratch); poly1305_multiply8_avx512(A3, A4, A5, // MSG/ACC 16 blocks - R0, R1, R2, R1P, R2P); // R^16..R^16, 4*5*R^16 + R0, R1, R2, R1P, R2P, // R^16..R^16, 4*5*R^16 + T0, T1, T2, T3, T4, T5, TMP, rscratch); __ vpaddq(A0, A0, B0, Assembler::AVX_512bit); // Add low 44-bit limbs from new blocks to accumulator __ vpaddq(A1, A1, B1, Assembler::AVX_512bit); // Add middle 44-bit limbs from new blocks to accumulator @@ -807,25 +826,27 @@ void StubGenerator::poly1305_process_blocks_avx512( __ evmovdquq(R2, Address(rsp, 64*2), Assembler::AVX_512bit); // Generate 4*5*[R^16..R^9] (ignore lowest limb) - __ vpsllq(T0, B1, 2, Assembler::AVX_512bit); - __ vpaddq(B3, B1, T0, Assembler::AVX_512bit); // R1' (R1*5) - __ vpsllq(T0, B2, 2, Assembler::AVX_512bit); - __ vpaddq(B4, B2, T0, Assembler::AVX_512bit); // R2' (R2*5) + __ vpsllq(D0, B1, 2, Assembler::AVX_512bit); + __ vpaddq(B3, B1, D0, Assembler::AVX_512bit); // R1' (R1*5) + __ vpsllq(D0, B2, 2, Assembler::AVX_512bit); + __ vpaddq(B4, B2, D0, Assembler::AVX_512bit); // R2' (R2*5) __ vpsllq(B3, B3, 2, Assembler::AVX_512bit); // 4*5*R __ vpsllq(B4, B4, 2, Assembler::AVX_512bit); // Generate 4*5*[R^8..R^1] (ignore lowest limb) - __ vpsllq(T0, R1, 2, Assembler::AVX_512bit); - __ vpaddq(R1P, R1, T0, Assembler::AVX_512bit); // R1' (R1*5) - __ vpsllq(T0, R2, 2, Assembler::AVX_512bit); - __ vpaddq(R2P, R2, T0, Assembler::AVX_512bit); // R2' (R2*5) + __ vpsllq(D0, R1, 2, Assembler::AVX_512bit); + __ vpaddq(R1P, R1, D0, Assembler::AVX_512bit); // R1' (R1*5) + __ vpsllq(D0, R2, 2, Assembler::AVX_512bit); + __ vpaddq(R2P, R2, D0, Assembler::AVX_512bit); // R2' (R2*5) __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); poly1305_multiply8_avx512(A0, A1, A2, // MSG/ACC 16 blocks - B0, B1, B2, B3, B4); // R^16-R^9, R1P, R2P + B0, B1, B2, B3, B4, // R^16-R^9, R1P, R2P + T0, T1, T2, T3, T4, T5, TMP, rscratch); poly1305_multiply8_avx512(A3, A4, A5, // MSG/ACC 16 blocks - R0, R1, R2, R1P, R2P); // R^8-R, R1P, R2P + R0, R1, R2, R1P, R2P, // R^8-R, R1P, R2P + T0, T1, T2, T3, T4, T5, TMP, rscratch); // Add all blocks (horizontally) // 16->8 blocks __ vpaddq(A0, A0, A3, Assembler::AVX_512bit); __ vpaddq(A1, A1, A4, Assembler::AVX_512bit); __ vpaddq(A2, A2, A5, Assembler::AVX_512bit); // 8 -> 4 blocks - __ vextracti64x4(T0, A0, 1); - __ 
vextracti64x4(T1, A1, 1); - __ vextracti64x4(T2, A2, 1); - __ vpaddq(A0, A0, T0, Assembler::AVX_256bit); - __ vpaddq(A1, A1, T1, Assembler::AVX_256bit); - __ vpaddq(A2, A2, T2, Assembler::AVX_256bit); + __ vextracti64x4(D0, A0, 1); + __ vextracti64x4(D1, A1, 1); + __ vextracti64x4(D2, A2, 1); + __ vpaddq(A0, A0, D0, Assembler::AVX_256bit); + __ vpaddq(A1, A1, D1, Assembler::AVX_256bit); + __ vpaddq(A2, A2, D2, Assembler::AVX_256bit); // 4 -> 2 blocks - __ vextracti32x4(T0, A0, 1); - __ vextracti32x4(T1, A1, 1); - __ vextracti32x4(T2, A2, 1); - __ vpaddq(A0, A0, T0, Assembler::AVX_128bit); - __ vpaddq(A1, A1, T1, Assembler::AVX_128bit); - __ vpaddq(A2, A2, T2, Assembler::AVX_128bit); + __ vextracti32x4(D0, A0, 1); + __ vextracti32x4(D1, A1, 1); + __ vextracti32x4(D2, A2, 1); + __ vpaddq(A0, A0, D0, Assembler::AVX_128bit); + __ vpaddq(A1, A1, D1, Assembler::AVX_128bit); + __ vpaddq(A2, A2, D2, Assembler::AVX_128bit); // 2 -> 1 blocks - __ vpsrldq(T0, A0, 8, Assembler::AVX_128bit); - __ vpsrldq(T1, A1, 8, Assembler::AVX_128bit); - __ vpsrldq(T2, A2, 8, Assembler::AVX_128bit); + __ vpsrldq(D0, A0, 8, Assembler::AVX_128bit); + __ vpsrldq(D1, A1, 8, Assembler::AVX_128bit); + __ vpsrldq(D2, A2, 8, Assembler::AVX_128bit); // Finish folding and clear second qword __ mov64(t0, 0xfd); __ kmovql(k1, t0); - __ evpaddq(A0, k1, A0, T0, false, Assembler::AVX_512bit); - __ evpaddq(A1, k1, A1, T1, false, Assembler::AVX_512bit); - __ evpaddq(A2, k1, A2, T2, false, Assembler::AVX_512bit); + __ evpaddq(A0, k1, A0, D0, false, Assembler::AVX_512bit); + __ evpaddq(A1, k1, A1, D1, false, Assembler::AVX_512bit); + __ evpaddq(A2, k1, A2, D2, false, Assembler::AVX_512bit); // Carry propagation - __ vpsrlq(T0, A0, 44, Assembler::AVX_512bit); + __ vpsrlq(D0, A0, 44, Assembler::AVX_512bit); __ evpandq(A0, A0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits - __ vpaddq(A1, A1, T0, Assembler::AVX_512bit); - __ vpsrlq(T0, A1, 44, Assembler::AVX_512bit); + __ vpaddq(A1, A1, D0, Assembler::AVX_512bit); + __ vpsrlq(D0, A1, 44, Assembler::AVX_512bit); __ evpandq(A1, A1, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // Clear top 20 bits - __ vpaddq(A2, A2, T0, Assembler::AVX_512bit); - __ vpsrlq(T0, A2, 42, Assembler::AVX_512bit); + __ vpaddq(A2, A2, D0, Assembler::AVX_512bit); + __ vpsrlq(D0, A2, 42, Assembler::AVX_512bit); __ evpandq(A2, A2, ExternalAddress(poly1305_mask42()), Assembler::AVX_512bit, rscratch); // Clear top 22 bits - __ vpsllq(T1, T0, 2, Assembler::AVX_512bit); - __ vpaddq(T0, T0, T1, Assembler::AVX_512bit); - __ vpaddq(A0, A0, T0, Assembler::AVX_512bit); + __ vpsllq(D1, D0, 2, Assembler::AVX_512bit); + __ vpaddq(D0, D0, D1, Assembler::AVX_512bit); + __ vpaddq(A0, A0, D0, Assembler::AVX_512bit); // Put together A (accumulator) __ movq(a0, A0); @@ -894,9 +915,9 @@ void StubGenerator::poly1305_process_blocks_avx512( // Cleanup __ vpxorq(xmm0, xmm0, xmm0, Assembler::AVX_512bit); __ vpxorq(xmm1, xmm1, xmm1, Assembler::AVX_512bit); - __ vpxorq(T0, T0, T0, Assembler::AVX_512bit); - __ vpxorq(T1, T1, T1, Assembler::AVX_512bit); - __ vpxorq(T2, T2, T2, Assembler::AVX_512bit); + __ vpxorq(D0, D0, D0, Assembler::AVX_512bit); + __ vpxorq(D1, D1, D1, Assembler::AVX_512bit); + __ vpxorq(D2, D2, D2, Assembler::AVX_512bit); __ vpxorq(C0, C0, C0, Assembler::AVX_512bit); __ vpxorq(C1, C1, C1, Assembler::AVX_512bit); __ vpxorq(C2, C2, C2, Assembler::AVX_512bit); @@ -946,12 +967,26 @@ address StubGenerator::generate_poly1305_processBlocks() { __ push(r14); __ 
push(r15); + // Register Map const Register input = rdi; const Register length = rbx; const Register accumulator = rcx; const Register R = r8; - // void processBlocks(byte[] input, int len, int[5] a, int[5] r) + const Register a0 = rsi; // [in/out] accumulator bits 63..0 + const Register a1 = r9; // [in/out] accumulator bits 127..64 + const Register a2 = r10; // [in/out] accumulator bits 195..128 + const Register r0 = r11; // R constant bits 63..0 + const Register r1 = r12; // R constant bits 127..64 + const Register c1 = r8; // 5*R (upper limb only) + const Register t0 = r13; + const Register t1 = r14; + const Register t2 = r15; + const Register mulql = rax; + const Register mulqh = rdx; + + // Normalize input + // pseudo-signature: void poly1305_processBlocks(byte[] input, int length, int[5] accumulator, int[5] R) // input, a, r pointers point at first array element // java headers bypassed in LibraryCallKit::inline_poly1305_processBlocks #ifdef _WIN64 @@ -974,19 +1009,10 @@ address StubGenerator::generate_poly1305_processBlocks() { __ mov(accumulator, c_rarg2); #endif - const Register a0 = rsi; // [in/out] accumulator bits 63..0 - const Register a1 = r9; // [in/out] accumulator bits 127..64 - const Register a2 = r10; // [in/out] accumulator bits 195..128 - const Register r0 = r11; // R constant bits 63..0 - const Register r1 = r12; // R constant bits 127..64 - const Register c1 = r8; // 5*R (upper limb only) - const Register t1 = r13; - const Register t2 = r14; - Label L_process16Loop, L_process16LoopDone; // Load R into r1:r0 - poly1305_limbs(R, r0, r1, noreg, t1, t2); + poly1305_limbs(R, r0, r1, noreg, t0, t1); // Compute 5*R (Upper limb only) __ movq(c1, r1); @@ -994,7 +1020,7 @@ address StubGenerator::generate_poly1305_processBlocks() { __ addq(c1, r1); // c1 = r1 + (r1 >> 2) // Load accumulator into a2:a1:a0 - poly1305_limbs(accumulator, a0, a1, a2, t1, t2); + poly1305_limbs(accumulator, a0, a1, a2, t0, t1); // VECTOR LOOP: Minimum of 256 bytes to run vectorized code __ cmpl(length, 16*16); @@ -1012,7 +1038,9 @@ address StubGenerator::generate_poly1305_processBlocks() { __ addq(a0, Address(input,0)); __ adcq(a1, Address(input,8)); __ adcq(a2,1); - poly1305_multiply_scalar(a0, a1, a2, r0, r1, c1, false); + poly1305_multiply_scalar(a0, a1, a2, + r0, r1, c1, false, + t0, t1, t2, mulql, mulqh); __ subl(length, 16); __ lea(input, Address(input,16)); @@ -1020,7 +1048,7 @@ address StubGenerator::generate_poly1305_processBlocks() { __ bind(L_process16LoopDone); // Write output - poly1305_limbs_out(a0, a1, a2, accumulator, t1, t2); + poly1305_limbs_out(a0, a1, a2, accumulator, t0, t1); __ pop(r15); __ pop(r14); From 56aed9b1932d973f941e0af1dd87c02337e70981 Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Wed, 16 Nov 2022 22:08:26 -0500 Subject: [PATCH 22/23] vzeroall, no spill, reg re-map --- src/hotspot/cpu/x86/assembler_x86.cpp | 7 + src/hotspot/cpu/x86/assembler_x86.hpp | 2 + .../cpu/x86/stubGenerator_x86_64_poly.cpp | 296 ++++++++---------- 3 files changed, 138 insertions(+), 167 deletions(-) diff --git a/src/hotspot/cpu/x86/assembler_x86.cpp b/src/hotspot/cpu/x86/assembler_x86.cpp index 8fb7960453986..d1831aac96c9b 100644 --- a/src/hotspot/cpu/x86/assembler_x86.cpp +++ b/src/hotspot/cpu/x86/assembler_x86.cpp @@ -13553,6 +13553,13 @@ void Assembler::vzeroupper() { emit_copy(code_section(), vzup_code, vzup_len); } +void Assembler::vzeroall() { + assert(VM_Version::supports_avx(), "requires AVX"); + InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ true, 
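/* no_mask_reg */ true, /* uses_vl */ false); + (void)vex_prefix_and_encode(0, 0, 0, VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes); + emit_int8(0x77); // VZEROALL: VEX.256.0F.WIG 77 (VZEROUPPER is the VEX.128 form). It clears only registers 0-15, so the poly1305 stub clears zmm16-zmm31 itself. +} + 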
void Assembler::pushq(Address src) { InstructionMark im(this); emit_int16(get_prefixq(src), (unsigned char)0xFF); diff --git a/src/hotspot/cpu/x86/assembler_x86.hpp b/src/hotspot/cpu/x86/assembler_x86.hpp index 297810a1fb68d..04dbb7907bee1 100644 --- a/src/hotspot/cpu/x86/assembler_x86.hpp +++ b/src/hotspot/cpu/x86/assembler_x86.hpp @@ -2769,6 +2769,8 @@ class Assembler : public AbstractAssembler { // runtime code and native libraries. void vzeroupper(); + void vzeroall(); + // Vector double compares void vcmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len); void evcmppd(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp index ca97d8eb5b9b2..c5f8ca1b49022 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp @@ -479,7 +479,7 @@ void StubGenerator::poly1305_limbs_out( // lower-case variables are scalar numbers in 3×44-bit limbs (in gprs) // upper-case variables are 8&16-element vector numbers in 3×44-bit limbs (in zmm registers) // -// C = a // [0 0 0 0 0 0 0 a] +// CL = a // [0 0 0 0 0 0 0 a] // AL = poly1305_limbs_avx512(input) // AH = poly1305_limbs_avx512(input+8) // AL = AL + CL // input+=16, length-=16 // @@ -493,17 +493,17 @@ void StubGenerator::poly1305_limbs_out( // r = a*r // r^4 = a // -// T = r^4 || r^3 || r^2 || r -// B = limbs(T) // [r^4 0 r^3 0 r^2 0 r^1 0 ] -// C = B >> 1 // [ 0 r^4 0 r^3 0 r^2 0 r^1] -// R = r^4 || r^4 || .. // [r^4 r^4 r^4 r^4 r^4 r^4 r^4 r^4] -// B = B×R // [r^8 0 r^7 0 r^6 0 r^5 0 ] -// B = B | C // [r^8 r^4 r^7 r^3 r^6 r^2 r^5 r^1] -// push(B) -// R = r^8 || r^8 || .. // [r^8 r^8 r^8 r^8 r^8 r^8 r^8 r^8] -// B = B × R // [r^16 r^12 r^15 r^11 r^14 r^10 r^13 r^9] -// push(B) -// R = r^16 || r^16 || .. // [r^16 r^16 r^16 r^16 r^16 r^16 r^16 r^16] +// T = r^4 || r^3 || r^2 || r +// B = limbs(T) // [r^4 0 r^3 0 r^2 0 r^1 0 ] +// CL = B >> 1 // [ 0 r^4 0 r^3 0 r^2 0 r^1] +// R = r^4 || r^4 || .. // [r^4 r^4 r^4 r^4 r^4 r^4 r^4 r^4] +// B = B×R // [r^8 0 r^7 0 r^6 0 r^5 0 ] +// B = B | CL // [r^8 r^4 r^7 r^3 r^6 r^2 r^5 r^1] +// CL = B +// R = r^8 || r^8 || .. // [r^8 r^8 r^8 r^8 r^8 r^8 r^8 r^8] +// B = B × R // [r^16 r^12 r^15 r^11 r^14 r^10 r^13 r^9] +// CH = B +// R = r^16 || r^16 || .. 
// [r^16 r^16 r^16 r^16 r^16 r^16 r^16 r^16] // // for (;length>=16; input+=16, length-=16) // BL = poly1305_limbs_avx512(input) // BH = poly1305_limbs_avx512(input+8) @@ -513,10 +513,8 @@ void StubGenerator::poly1305_limbs_out( // AL = AL + BL // AH = AH + BH // -// B = pop() -// R = pop() -// AL = AL × R -// AH = AH × B +// AL = AL × CH +// AH = AH × CL // A = AL + AH // 16->8 blocks // T = A >> 4 // 8 ->4 blocks // A = A + T @@ -546,13 +544,13 @@ void StubGenerator::poly1305_limbs_out( // mulq(rax, rdx) in poly1305_multiply_scalar // // ZMMs: -// TMP: xmm6 -// C: xmm7-9 -// D: xmm2-4 -// T: xmm0-5 -// A: xmm13-18 -// B: xmm19-24 -// R: xmm25-29 +// D: xmm0-1 +// TMP: xmm2 +// T: xmm3-8 +// A: xmm9-14 +// B: xmm15-20 +// C: xmm21-26 +// R: xmm27-31 void StubGenerator::poly1305_process_blocks_avx512( const Register input, const Register length, const Register a0, const Register a1, const Register a2, const Register r0, const Register r1, const Register c1) { @@ -566,44 +564,43 @@ void StubGenerator::poly1305_process_blocks_avx512( const Register mulql = rax; const Register mulqh = rdx; - const XMMRegister TMP = xmm6; - - const XMMRegister D0 = xmm2; - const XMMRegister D1 = xmm3; - const XMMRegister D2 = xmm4; - - const XMMRegister C0 = xmm7; - const XMMRegister C1 = xmm8; - const XMMRegister C2 = xmm9; - - const XMMRegister T0 = xmm0; - const XMMRegister T1 = xmm1; - const XMMRegister T2 = xmm2; - const XMMRegister T3 = xmm3; - const XMMRegister T4 = xmm4; - const XMMRegister T5 = xmm5; - - const XMMRegister A0 = xmm13; - const XMMRegister A1 = xmm14; - const XMMRegister A2 = xmm15; - const XMMRegister A3 = xmm16; - const XMMRegister A4 = xmm17; - const XMMRegister A5 = xmm18; - - const XMMRegister B0 = xmm19; - const XMMRegister B1 = xmm20; - const XMMRegister B2 = xmm21; - const XMMRegister B3 = xmm22; - const XMMRegister B4 = xmm23; - const XMMRegister B5 = xmm24; - - const XMMRegister R0 = xmm25; - const XMMRegister R1 = xmm26; - const XMMRegister R2 = xmm27; - const XMMRegister R1P = xmm28; - const XMMRegister R2P = xmm29; - - __ subq(rsp, (512/8)*6); // Make room to store 6 zmm registers (powers of R) + const XMMRegister D0 = xmm0; + const XMMRegister D1 = xmm1; + const XMMRegister TMP = xmm2; + + const XMMRegister T0 = xmm3; + const XMMRegister T1 = xmm4; + const XMMRegister T2 = xmm5; + const XMMRegister T3 = xmm6; + const XMMRegister T4 = xmm7; + const XMMRegister T5 = xmm8; + + const XMMRegister A0 = xmm9; + const XMMRegister A1 = xmm10; + const XMMRegister A2 = xmm11; + const XMMRegister A3 = xmm12; + const XMMRegister A4 = xmm13; + const XMMRegister A5 = xmm14; + + const XMMRegister B0 = xmm15; + const XMMRegister B1 = xmm16; + const XMMRegister B2 = xmm17; + const XMMRegister B3 = xmm18; + const XMMRegister B4 = xmm19; + const XMMRegister B5 = xmm20; + + const XMMRegister C0 = xmm21; + const XMMRegister C1 = xmm22; + const XMMRegister C2 = xmm23; + const XMMRegister C3 = xmm24; + const XMMRegister C4 = xmm25; + const XMMRegister C5 = xmm26; + + const XMMRegister R0 = xmm27; + const XMMRegister R1 = xmm28; + const XMMRegister R2 = xmm29; + const XMMRegister R1P = xmm30; + const XMMRegister R2P = xmm31; // Spread accumulator into 44-bit limbs in quadwords C0,C1,C2 __ movq(t0, a0); @@ -646,12 +643,12 @@ void StubGenerator::poly1305_process_blocks_avx512( __ lea(input, Address(input,16*16)); // Compute the powers of R^1..R^4 and form 44-bit limbs of each - // D0 to have bits 0-127 in 4 quadword pairs - // D1 to have bits 128-129 in alternating 8 qwords - __ vpxorq(D1, D1, D1, Assembler::AVX_512bit); - __ movq(D2, r0); - __ vpinsrq(D2, D2, r1, 1); - __ vinserti32x4(D0, D0, 
D2, 3); + // T0 to have bits 0-127 in 4 quadword pairs + // T1 to have bits 128-129 in alternating 8 qwords + __ vpxorq(T1, T1, T1, Assembler::AVX_512bit); + __ movq(T2, r0); + __ vpinsrq(T2, T2, r1, 1); + __ vinserti32x4(T0, T0, T2, 3); // Calculate R^2 __ movq(a0, r0); @@ -661,44 +658,44 @@ void StubGenerator::poly1305_process_blocks_avx512( r0, r1, c1, true, t0, t1, t2, mulql, mulqh); - __ movq(D2, a0); - __ vpinsrq(D2, D2, a1, 1); - __ vinserti32x4(D0, D0, D2, 2); - __ movq(D2, a2); - __ vinserti32x4(D1, D1, D2, 2); + __ movq(T2, a0); + __ vpinsrq(T2, T2, a1, 1); + __ vinserti32x4(T0, T0, T2, 2); + __ movq(T2, a2); + __ vinserti32x4(T1, T1, T2, 2); // Calculate R^3 poly1305_multiply_scalar(a0, a1, a2, r0, r1, c1, false, t0, t1, t2, mulql, mulqh); - __ movq(D2, a0); - __ vpinsrq(D2, D2, a1, 1); - __ vinserti32x4(D0, D0, D2, 1); - __ movq(D2, a2); - __ vinserti32x4(D1, D1, D2, 1); + __ movq(T2, a0); + __ vpinsrq(T2, T2, a1, 1); + __ vinserti32x4(T0, T0, T2, 1); + __ movq(T2, a2); + __ vinserti32x4(T1, T1, T2, 1); // Calculate R^4 poly1305_multiply_scalar(a0, a1, a2, r0, r1, c1, false, t0, t1, t2, mulql, mulqh); - __ movq(D2, a0); - __ vpinsrq(D2, D2, a1, 1); - __ vinserti32x4(D0, D0, D2, 0); - __ movq(D2, a2); - __ vinserti32x4(D1, D1, D2, 0); + __ movq(T2, a0); + __ vpinsrq(T2, T2, a1, 1); + __ vinserti32x4(T0, T0, T2, 0); + __ movq(T2, a2); + __ vinserti32x4(T1, T1, T2, 0); // Interleave the powers of R^1..R^4 to form 44-bit limbs (half-empty) // B0 to have bits 0-43 of all 4 blocks in alternating 8 qwords // B1 to have bits 87-44 of all 4 blocks in alternating 8 qwords // B2 to have bits 127-88 of all 4 blocks in alternating 8 qwords - __ vpxorq(D2, D2, D2, Assembler::AVX_512bit); - poly1305_limbs_avx512(D0, D2, B0, B1, B2, false, TMP, rscratch); + __ vpxorq(T2, T2, T2, Assembler::AVX_512bit); + poly1305_limbs_avx512(T0, T2, B0, B1, B2, false, TMP, rscratch); - // D1 contains the 2 highest bits of the powers of R - __ vpsllq(D1, D1, 40, Assembler::AVX_512bit); - __ evporq(B2, B2, D1, Assembler::AVX_512bit); + // T1 contains the 2 highest bits of the powers of R + __ vpsllq(T1, T1, 40, Assembler::AVX_512bit); + __ evporq(B2, B2, T1, Assembler::AVX_512bit); // Broadcast 44-bit limbs of R^4 into R0,R1,R2 __ mov(t0, a0); @@ -739,6 +736,11 @@ void StubGenerator::poly1305_process_blocks_avx512( __ evporq(B1, B1, C1, Assembler::AVX_512bit); __ evporq(B2, B2, C2, Assembler::AVX_512bit); + // Store R^8-R for later use + __ evmovdquq(C0, B0, Assembler::AVX_512bit); + __ evmovdquq(C1, B1, Assembler::AVX_512bit); + __ evmovdquq(C2, B2, Assembler::AVX_512bit); + // Broadcast R^8 __ vpbroadcastq(R0, B0, Assembler::AVX_512bit); __ vpbroadcastq(R1, B1, Assembler::AVX_512bit); @@ -752,20 +754,15 @@ void StubGenerator::poly1305_process_blocks_avx512( __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R^8 __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); - // Store R^8-R for later use - __ evmovdquq(Address(rsp, 64*0), B0, Assembler::AVX_512bit); - __ evmovdquq(Address(rsp, 64*1), B1, Assembler::AVX_512bit); - __ evmovdquq(Address(rsp, 64*2), B2, Assembler::AVX_512bit); - // Calculate R^16-R^9 poly1305_multiply8_avx512(B0, B1, B2, // ACC=R^8..R^1 R0, R1, R2, R1P, R2P, // R^8..R^8, 4*5*R^8 T0, T1, T2, T3, T4, T5, TMP, rscratch); // Store R^16-R^9 for later use - __ evmovdquq(Address(rsp, 64*3), B0, Assembler::AVX_512bit); - __ evmovdquq(Address(rsp, 64*4), B1, Assembler::AVX_512bit); - __ evmovdquq(Address(rsp, 64*5), B2, Assembler::AVX_512bit); + __ evmovdquq(C3, B0, Assembler::AVX_512bit); + __ 
evmovdquq(C4, B1, Assembler::AVX_512bit); + __ evmovdquq(C5, B2, Assembler::AVX_512bit); // Broadcast R^16 __ vpbroadcastq(R0, B0, Assembler::AVX_512bit); @@ -816,36 +813,28 @@ __ bind(L_process256LoopDone); // Tail processing: Need to multiply ACC by R^16..R^1 and add it all up into a single scalar value - // Read R^16-R^9 - __ evmovdquq(B0, Address(rsp, 64*3), Assembler::AVX_512bit); - __ evmovdquq(B1, Address(rsp, 64*4), Assembler::AVX_512bit); - __ evmovdquq(B2, Address(rsp, 64*5), Assembler::AVX_512bit); - // Read R^8-R - __ evmovdquq(R0, Address(rsp, 64*0), Assembler::AVX_512bit); - __ evmovdquq(R1, Address(rsp, 64*1), Assembler::AVX_512bit); - __ evmovdquq(R2, Address(rsp, 64*2), Assembler::AVX_512bit); - // Generate 4*5*[R^16..R^9] (ignore lowest limb) - __ vpsllq(D0, B1, 2, Assembler::AVX_512bit); - __ vpaddq(B3, B1, D0, Assembler::AVX_512bit); // R1' (R1*5) - __ vpsllq(D0, B2, 2, Assembler::AVX_512bit); - __ vpaddq(B4, B2, D0, Assembler::AVX_512bit); // R2' (R2*5) - __ vpsllq(B3, B3, 2, Assembler::AVX_512bit); // 4*5*R - __ vpsllq(B4, B4, 2, Assembler::AVX_512bit); + // Use D0 ~ R1P, D1 ~ R2P for higher powers + __ vpsllq(R1P, C4, 2, Assembler::AVX_512bit); + __ vpsllq(R2P, C5, 2, Assembler::AVX_512bit); + __ vpaddq(R1P, R1P, C4, Assembler::AVX_512bit); // 5*[R^16..R^9] + __ vpaddq(R2P, R2P, C5, Assembler::AVX_512bit); + __ vpsllq(D0, R1P, 2, Assembler::AVX_512bit); // 4*5*[R^16..R^9] + __ vpsllq(D1, R2P, 2, Assembler::AVX_512bit); // Generate 4*5*[R^8..R^1] (ignore lowest limb) - __ vpsllq(D0, R1, 2, Assembler::AVX_512bit); - __ vpaddq(R1P, R1, D0, Assembler::AVX_512bit); // R1' (R1*5) - __ vpsllq(D0, R2, 2, Assembler::AVX_512bit); - __ vpaddq(R2P, R2, D0, Assembler::AVX_512bit); // R2' (R2*5) - __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*R + __ vpsllq(R1P, C1, 2, Assembler::AVX_512bit); + __ vpsllq(R2P, C2, 2, Assembler::AVX_512bit); + __ vpaddq(R1P, R1P, C1, Assembler::AVX_512bit); // 5*[R^8..R^1] + __ vpaddq(R2P, R2P, C2, Assembler::AVX_512bit); + __ vpsllq(R1P, R1P, 2, Assembler::AVX_512bit); // 4*5*[R^8..R^1] __ vpsllq(R2P, R2P, 2, Assembler::AVX_512bit); poly1305_multiply8_avx512(A0, A1, A2, // MSG/ACC 16 blocks - B0, B1, B2, B3, B4, // R^16-R^9, R1P, R2P + C3, C4, C5, D0, D1, // R^16-R^9, R1P, R2P T0, T1, T2, T3, T4, T5, TMP, rscratch); poly1305_multiply8_avx512(A3, A4, A5, // MSG/ACC 16 blocks - R0, R1, R2, R1P, R2P, // R^8-R, R1P, R2P + C0, C1, C2, R1P, R2P, // R^8-R, R1P, R2P T0, T1, T2, T3, T4, T5, TMP, rscratch); // Add all blocks (horizontally) // 16->8 blocks __ vpaddq(A0, A0, A3, Assembler::AVX_512bit); __ vpaddq(A1, A1, A4, Assembler::AVX_512bit); __ vpaddq(A2, A2, A5, Assembler::AVX_512bit); // 8 -> 4 blocks - __ vextracti64x4(D0, A0, 1); - __ vextracti64x4(D1, A1, 1); - __ vextracti64x4(D2, A2, 1); - __ vpaddq(A0, A0, D0, Assembler::AVX_256bit); - __ vpaddq(A1, A1, D1, Assembler::AVX_256bit); - __ vpaddq(A2, A2, D2, Assembler::AVX_256bit); + __ vextracti64x4(T0, A0, 1); + __ vextracti64x4(T1, A1, 1); + __ vextracti64x4(T2, A2, 1); + __ vpaddq(A0, A0, T0, Assembler::AVX_256bit); + __ vpaddq(A1, A1, T1, Assembler::AVX_256bit); + __ vpaddq(A2, A2, T2, Assembler::AVX_256bit); // 4 -> 2 blocks - __ vextracti32x4(D0, A0, 1); - __ vextracti32x4(D1, A1, 1); - __ vextracti32x4(D2, A2, 1); - __ vpaddq(A0, A0, D0, Assembler::AVX_128bit); - __ vpaddq(A1, A1, D1, Assembler::AVX_128bit); - __ vpaddq(A2, A2, D2, Assembler::AVX_128bit); + __ vextracti32x4(T0, A0, 1); + __ vextracti32x4(T1, A1, 1); + __ vextracti32x4(T2, A2, 1); + __ vpaddq(A0, A0, T0, Assembler::AVX_128bit); + 
__ vpaddq(A1, A1, T1, Assembler::AVX_128bit); + __ vpaddq(A2, A2, T2, Assembler::AVX_128bit); // 2 -> 1 blocks - __ vpsrldq(D0, A0, 8, Assembler::AVX_128bit); - __ vpsrldq(D1, A1, 8, Assembler::AVX_128bit); - __ vpsrldq(D2, A2, 8, Assembler::AVX_128bit); + __ vpsrldq(T0, A0, 8, Assembler::AVX_128bit); + __ vpsrldq(T1, A1, 8, Assembler::AVX_128bit); + __ vpsrldq(T2, A2, 8, Assembler::AVX_128bit); // Finish folding and clear second qword __ mov64(t0, 0xfd); __ kmovql(k1, t0); - __ evpaddq(A0, k1, A0, D0, false, Assembler::AVX_512bit); - __ evpaddq(A1, k1, A1, D1, false, Assembler::AVX_512bit); - __ evpaddq(A2, k1, A2, D2, false, Assembler::AVX_512bit); + __ evpaddq(A0, k1, A0, T0, false, Assembler::AVX_512bit); + __ evpaddq(A1, k1, A1, T1, false, Assembler::AVX_512bit); + __ evpaddq(A2, k1, A2, T2, false, Assembler::AVX_512bit); // Carry propagation __ vpsrlq(D0, A0, 44, Assembler::AVX_512bit); @@ -913,38 +902,11 @@ void StubGenerator::poly1305_process_blocks_avx512( __ adcq(a2, 0); // Cleanup - __ vpxorq(xmm0, xmm0, xmm0, Assembler::AVX_512bit); - __ vpxorq(xmm1, xmm1, xmm1, Assembler::AVX_512bit); - __ vpxorq(D0, D0, D0, Assembler::AVX_512bit); - __ vpxorq(D1, D1, D1, Assembler::AVX_512bit); - __ vpxorq(D2, D2, D2, Assembler::AVX_512bit); - __ vpxorq(C0, C0, C0, Assembler::AVX_512bit); - __ vpxorq(C1, C1, C1, Assembler::AVX_512bit); - __ vpxorq(C2, C2, C2, Assembler::AVX_512bit); - __ vpxorq(A0, A0, A0, Assembler::AVX_512bit); - __ vpxorq(A1, A1, A1, Assembler::AVX_512bit); - __ vpxorq(A2, A2, A2, Assembler::AVX_512bit); - __ vpxorq(A3, A3, A3, Assembler::AVX_512bit); - __ vpxorq(A4, A4, A4, Assembler::AVX_512bit); - __ vpxorq(A5, A5, A5, Assembler::AVX_512bit); - __ vpxorq(B0, B0, B0, Assembler::AVX_512bit); - __ vpxorq(B1, B1, B1, Assembler::AVX_512bit); - __ vpxorq(B2, B2, B2, Assembler::AVX_512bit); - __ vpxorq(B3, B3, B3, Assembler::AVX_512bit); - __ vpxorq(B4, B4, B4, Assembler::AVX_512bit); - __ vpxorq(B5, B5, B5, Assembler::AVX_512bit); - __ vpxorq(R0, R0, R0, Assembler::AVX_512bit); - __ vpxorq(R1, R1, R1, Assembler::AVX_512bit); - __ vpxorq(R2, R2, R2, Assembler::AVX_512bit); - __ vpxorq(R1P, R1P, R1P, Assembler::AVX_512bit); - __ vpxorq(R2P, R2P, R2P, Assembler::AVX_512bit); - __ evmovdquq(Address(rsp, 64*3), A0, Assembler::AVX_512bit); - __ evmovdquq(Address(rsp, 64*4), A0, Assembler::AVX_512bit); - __ evmovdquq(Address(rsp, 64*5), A0, Assembler::AVX_512bit); - __ evmovdquq(Address(rsp, 64*0), A0, Assembler::AVX_512bit); - __ evmovdquq(Address(rsp, 64*1), A0, Assembler::AVX_512bit); - __ evmovdquq(Address(rsp, 64*2), A0, Assembler::AVX_512bit); - __ addq(rsp, 512/8*6); // (powers of R) + // Zero out zmm0-zmm31. 
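+ // (vzeroall() clears only zmm0-zmm15; zmm16-zmm31 are not touched by it, so the loop below clears them explicitly.)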
+ __ vzeroall(); + for (XMMRegister rxmm = xmm16; rxmm->is_valid(); rxmm = rxmm->successor()) { + __ vpxorq(rxmm, rxmm, rxmm, Assembler::AVX_512bit); + } } // This function consumes as many whole 16-byte blocks as available in input From 08ea45e5d4ab071a56dd1a50d517dc3ed4428553 Mon Sep 17 00:00:00 2001 From: Volodymyr Paprotski Date: Thu, 17 Nov 2022 15:36:10 -0500 Subject: [PATCH 23/23] remove early return --- .../cpu/x86/stubGenerator_x86_64_poly.cpp | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp index c5f8ca1b49022..97f9f6ccc470b 100644 --- a/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp +++ b/src/hotspot/cpu/x86/stubGenerator_x86_64_poly.cpp @@ -374,23 +374,22 @@ void StubGenerator::poly1305_limbs( } __ shlq(t0, 40); __ addq(a1, t0); - if (a2 == noreg) { - return; + if (a2 != noreg) { + __ adcq(a2, 0); + + // One round of reduction + // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0 + __ movq(t0, a2); + __ andq(t0, ~3); + __ andq(a2, 3); + __ movq(t1, t0); + __ shrq(t1, 2); + __ addq(t0, t1); + + __ addq(a0, t0); + __ adcq(a1, 0); + __ adcq(a2, 0); } - __ adcq(a2, 0); - - // One round of reduction - // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0 - __ movq(t0, a2); - __ andq(t0, ~3); - __ andq(a2, 3); - __ movq(t1, t0); - __ shrq(t1, 2); - __ addq(t0, t1); - - __ addq(a0, t0); - __ adcq(a1, 0); - __ adcq(a2, 0); } /**