Skip to content

Commit 207bd00

Browse files
committed
8313756: [BACKOUT] 8308682: Enhance AES performance
Reviewed-by: thartmann
1 parent 823f5b9 commit 207bd00

File tree

7 files changed: 37 additions and 106 deletions

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

Lines changed: 14 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -2944,23 +2944,6 @@ class StubGenerator: public StubCodeGenerator {
29442944
return start;
29452945
}
29462946

2947-
// Big-endian 128-bit + 64-bit -> 128-bit addition.
2948-
// Inputs: 128-bits. in is preserved.
2949-
// The least-significant 64-bit word is in the upper dword of each vector.
2950-
// inc (the 64-bit increment) is preserved. Its lower dword must be zero.
2951-
// Output: result
2952-
void be_add_128_64(FloatRegister result, FloatRegister in,
2953-
FloatRegister inc, FloatRegister tmp) {
2954-
assert_different_registers(result, tmp, inc);
2955-
2956-
__ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of
2957-
// input
2958-
__ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
2959-
__ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and
2960-
// MSD == 0 (must be!) to LSD
2961-
__ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow
2962-
}
2963-
29642947
// CTR AES crypt.
29652948
// Arguments:
29662949
//
@@ -3070,16 +3053,13 @@ class StubGenerator: public StubCodeGenerator {
30703053
// Setup the counter
30713054
__ movi(v4, __ T4S, 0);
30723055
__ movi(v5, __ T4S, 1);
3073-
__ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3056+
__ ins(v4, __ S, v5, 3, 3); // v4 contains { 0, 0, 0, 1 }
30743057

3075-
// 128-bit big-endian increment
3076-
__ ld1(v0, __ T16B, counter);
3077-
__ rev64(v16, __ T16B, v0);
3078-
be_add_128_64(v16, v16, v4, /*tmp*/v5);
3079-
__ rev64(v16, __ T16B, v16);
3080-
__ st1(v16, __ T16B, counter);
3081-
// Previous counter value is in v0
3082-
// v4 contains { 0, 1 }
3058+
__ ld1(v0, __ T16B, counter); // Load the counter into v0
3059+
__ rev32(v16, __ T16B, v0);
3060+
__ addv(v16, __ T4S, v16, v4);
3061+
__ rev32(v16, __ T16B, v16);
3062+
__ st1(v16, __ T16B, counter); // Save the incremented counter back
30833063

30843064
{
30853065
// We have fewer than bulk_width blocks of data left. Encrypt
@@ -3111,9 +3091,9 @@ class StubGenerator: public StubCodeGenerator {
31113091

31123092
// Increment the counter, store it back
31133093
__ orr(v0, __ T16B, v16, v16);
3114-
__ rev64(v16, __ T16B, v16);
3115-
be_add_128_64(v16, v16, v4, /*tmp*/v5);
3116-
__ rev64(v16, __ T16B, v16);
3094+
__ rev32(v16, __ T16B, v16);
3095+
__ addv(v16, __ T4S, v16, v4);
3096+
__ rev32(v16, __ T16B, v16);
31173097
__ st1(v16, __ T16B, counter); // Save the incremented counter back
31183098

31193099
__ b(inner_loop);
@@ -3161,7 +3141,7 @@ class StubGenerator: public StubCodeGenerator {
31613141
// Keys should already be loaded into the correct registers
31623142

31633143
__ ld1(v0, __ T16B, counter); // v0 contains the first counter
3164-
__ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3144+
__ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
31653145

31663146
// AES/CTR loop
31673147
{
@@ -3171,12 +3151,12 @@ class StubGenerator: public StubCodeGenerator {
31713151
// Setup the counters
31723152
__ movi(v8, __ T4S, 0);
31733153
__ movi(v9, __ T4S, 1);
3174-
__ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3154+
__ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
31753155

31763156
for (int i = 0; i < bulk_width; i++) {
31773157
FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3178-
__ rev64(v0_ofs, __ T16B, v16);
3179-
be_add_128_64(v16, v16, v8, /*tmp*/v9);
3158+
__ rev32(v0_ofs, __ T16B, v16);
3159+
__ addv(v16, __ T4S, v16, v8);
31803160
}
31813161

31823162
__ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
@@ -3206,7 +3186,7 @@ class StubGenerator: public StubCodeGenerator {
32063186
}
32073187

32083188
// Save the counter back where it goes
3209-
__ rev64(v16, __ T16B, v16);
3189+
__ rev32(v16, __ T16B, v16);
32103190
__ st1(v16, __ T16B, counter);
32113191

32123192
__ pop(saved_regs, sp);

src/hotspot/cpu/x86/assembler_x86.cpp

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -4431,14 +4431,6 @@ void Assembler::evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, Compa
44314431
emit_int24(0x3E, (0xC0 | encode), vcc);
44324432
}
44334433

4434-
void Assembler::evpcmpuq(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len) {
4435-
assert(VM_Version::supports_avx512vl(), "");
4436-
InstructionAttr attributes(vector_len, /* rex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
4437-
attributes.set_is_evex_instruction();
4438-
int encode = vex_prefix_and_encode(kdst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
4439-
emit_int24(0x1E, (0xC0 | encode), vcc);
4440-
}
4441-
44424434
void Assembler::evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len) {
44434435
assert(VM_Version::supports_avx512vlbw(), "");
44444436
InstructionMark im(this);

src/hotspot/cpu/x86/assembler_x86.hpp

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1806,8 +1806,6 @@ class Assembler : public AbstractAssembler {
18061806
void evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
18071807
void evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len);
18081808

1809-
void evpcmpuq(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
1810-
18111809
void pcmpeqw(XMMRegister dst, XMMRegister src);
18121810
void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
18131811
void evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);

src/hotspot/cpu/x86/macroAssembler_x86.cpp

Lines changed: 0 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -9257,17 +9257,6 @@ void MacroAssembler::evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral sr
92579257
}
92589258
}
92599259

9260-
void MacroAssembler::evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch) {
9261-
assert(rscratch != noreg || always_reachable(src), "missing");
9262-
9263-
if (reachable(src)) {
9264-
Assembler::evpaddq(dst, mask, nds, as_Address(src), merge, vector_len);
9265-
} else {
9266-
lea(rscratch, src);
9267-
Assembler::evpaddq(dst, mask, nds, Address(rscratch, 0), merge, vector_len);
9268-
}
9269-
}
9270-
92719260
void MacroAssembler::evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
92729261
assert(rscratch != noreg || always_reachable(src), "missing");
92739262

src/hotspot/cpu/x86/macroAssembler_x86.hpp

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1788,9 +1788,6 @@ class MacroAssembler: public Assembler {
17881788
using Assembler::evpandq;
17891789
void evpandq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
17901790

1791-
using Assembler::evpaddq;
1792-
void evpaddq(XMMRegister dst, KRegister mask, XMMRegister nds, AddressLiteral src, bool merge, int vector_len, Register rscratch = noreg);
1793-
17941791
using Assembler::evporq;
17951792
void evporq(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch = noreg);
17961793

src/hotspot/cpu/x86/stubGenerator_x86_64.hpp

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -364,8 +364,7 @@ class StubGenerator: public StubCodeGenerator {
364364

365365
// Utility routine for increase 128bit counter (iv in CTR mode)
366366
void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block);
367-
void ev_add128(XMMRegister xmmdst, XMMRegister xmmsrc1, XMMRegister xmmsrc2,
368-
int vector_len, KRegister ktmp, Register rscratch = noreg);
367+
369368
void generate_aes_stubs();
370369

371370

src/hotspot/cpu/x86/stubGenerator_x86_64_aes.cpp

Lines changed: 22 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -121,16 +121,6 @@ static address counter_mask_linc32_addr() {
121121
return (address)COUNTER_MASK_LINC32;
122122
}
123123

124-
ATTRIBUTE_ALIGNED(64) uint64_t COUNTER_MASK_ONES[] = {
125-
0x0000000000000000UL, 0x0000000000000001UL,
126-
0x0000000000000000UL, 0x0000000000000001UL,
127-
0x0000000000000000UL, 0x0000000000000001UL,
128-
0x0000000000000000UL, 0x0000000000000001UL,
129-
};
130-
static address counter_mask_ones_addr() {
131-
return (address)COUNTER_MASK_ONES;
132-
}
133-
134124
ATTRIBUTE_ALIGNED(64) static const uint64_t GHASH_POLYNOMIAL_REDUCTION[] = {
135125
0x00000001C2000000UL, 0xC200000000000000UL,
136126
0x00000001C2000000UL, 0xC200000000000000UL,
@@ -1633,17 +1623,6 @@ void StubGenerator::ev_load_key(XMMRegister xmmdst, Register key, int offset, Re
16331623
__ evshufi64x2(xmmdst, xmmdst, xmmdst, 0x0, Assembler::AVX_512bit);
16341624
}
16351625

1636-
// Add 128-bit integers in xmmsrc1 to xmmsrc2, then place the result in xmmdst.
1637-
// Clobber ktmp and rscratch.
1638-
// Used by aesctr_encrypt.
1639-
void StubGenerator::ev_add128(XMMRegister xmmdst, XMMRegister xmmsrc1, XMMRegister xmmsrc2,
1640-
int vector_len, KRegister ktmp, Register rscratch) {
1641-
__ vpaddq(xmmdst, xmmsrc1, xmmsrc2, vector_len);
1642-
__ evpcmpuq(ktmp, xmmdst, xmmsrc2, __ lt, vector_len);
1643-
__ kshiftlbl(ktmp, ktmp, 1);
1644-
__ evpaddq(xmmdst, ktmp, xmmdst, ExternalAddress(counter_mask_ones_addr()), /*merge*/true,
1645-
vector_len, rscratch);
1646-
}
16471626

16481627
// AES-ECB Encrypt Operation
16491628
void StubGenerator::aesecb_encrypt(Register src_addr, Register dest_addr, Register key, Register len) {
@@ -2067,6 +2046,7 @@ void StubGenerator::aesecb_decrypt(Register src_addr, Register dest_addr, Regist
20672046
}
20682047

20692048

2049+
20702050
// AES Counter Mode using VAES instructions
20712051
void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Register key, Register counter,
20722052
Register len_reg, Register used, Register used_addr, Register saved_encCounter_start) {
@@ -2124,17 +2104,14 @@ void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Regist
21242104
// The counter is incremented after each block i.e. 16 bytes is processed;
21252105
// each zmm register has 4 counter values as its MSB
21262106
// the counters are incremented in parallel
2127-
2128-
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc0_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2129-
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2130-
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2131-
ev_add128(xmm9, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2132-
ev_add128(xmm10, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2133-
ev_add128(xmm11, xmm10, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2134-
ev_add128(xmm12, xmm11, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2135-
ev_add128(xmm13, xmm12, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2136-
ev_add128(xmm14, xmm13, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2137-
ev_add128(xmm15, xmm14, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2107+
__ vpaddd(xmm8, xmm8, ExternalAddress(counter_mask_linc0_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2108+
__ vpaddd(xmm9, xmm8, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2109+
__ vpaddd(xmm10, xmm9, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2110+
__ vpaddd(xmm11, xmm10, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2111+
__ vpaddd(xmm12, xmm11, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2112+
__ vpaddd(xmm13, xmm12, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2113+
__ vpaddd(xmm14, xmm13, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
2114+
__ vpaddd(xmm15, xmm14, ExternalAddress(counter_mask_linc4_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
21382115

21392116
// load linc32 mask in zmm register.linc32 increments counter by 32
21402117
__ evmovdquq(xmm19, ExternalAddress(counter_mask_linc32_addr()), Assembler::AVX_512bit, r15 /*rscratch*/);
@@ -2182,21 +2159,21 @@ void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Regist
21822159
// This is followed by incrementing counter values in zmm8-zmm15.
21832160
// Since we will be processing 32 blocks at a time, the counter is incremented by 32.
21842161
roundEnc(xmm21, 7);
2185-
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2162+
__ vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
21862163
roundEnc(xmm22, 7);
2187-
ev_add128(xmm9, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2164+
__ vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
21882165
roundEnc(xmm23, 7);
2189-
ev_add128(xmm10, xmm10, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2166+
__ vpaddq(xmm10, xmm10, xmm19, Assembler::AVX_512bit);
21902167
roundEnc(xmm24, 7);
2191-
ev_add128(xmm11, xmm11, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2168+
__ vpaddq(xmm11, xmm11, xmm19, Assembler::AVX_512bit);
21922169
roundEnc(xmm25, 7);
2193-
ev_add128(xmm12, xmm12, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2170+
__ vpaddq(xmm12, xmm12, xmm19, Assembler::AVX_512bit);
21942171
roundEnc(xmm26, 7);
2195-
ev_add128(xmm13, xmm13, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2172+
__ vpaddq(xmm13, xmm13, xmm19, Assembler::AVX_512bit);
21962173
roundEnc(xmm27, 7);
2197-
ev_add128(xmm14, xmm14, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2174+
__ vpaddq(xmm14, xmm14, xmm19, Assembler::AVX_512bit);
21982175
roundEnc(xmm28, 7);
2199-
ev_add128(xmm15, xmm15, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2176+
__ vpaddq(xmm15, xmm15, xmm19, Assembler::AVX_512bit);
22002177
roundEnc(xmm29, 7);
22012178

22022179
__ cmpl(rounds, 52);
@@ -2274,8 +2251,8 @@ void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Regist
22742251
__ vpshufb(xmm3, xmm11, xmm16, Assembler::AVX_512bit);
22752252
__ evpxorq(xmm3, xmm3, xmm20, Assembler::AVX_512bit);
22762253
// Increment counter values by 16
2277-
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2278-
ev_add128(xmm9, xmm9, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2254+
__ vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
2255+
__ vpaddq(xmm9, xmm9, xmm19, Assembler::AVX_512bit);
22792256
// AES encode rounds
22802257
roundEnc(xmm21, 3);
22812258
roundEnc(xmm22, 3);
@@ -2342,7 +2319,7 @@ void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Regist
23422319
__ vpshufb(xmm1, xmm9, xmm16, Assembler::AVX_512bit);
23432320
__ evpxorq(xmm1, xmm1, xmm20, Assembler::AVX_512bit);
23442321
// increment counter by 8
2345-
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2322+
__ vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
23462323
// AES encode
23472324
roundEnc(xmm21, 1);
23482325
roundEnc(xmm22, 1);
@@ -2399,9 +2376,8 @@ void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Regist
23992376
// XOR counter with first roundkey
24002377
__ vpshufb(xmm0, xmm8, xmm16, Assembler::AVX_512bit);
24012378
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_512bit);
2402-
24032379
// Increment counter
2404-
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_512bit, /*ktmp*/k1, r15 /*rscratch*/);
2380+
__ vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_512bit);
24052381
__ vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_512bit);
24062382
__ vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_512bit);
24072383
__ vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_512bit);
@@ -2451,7 +2427,7 @@ void StubGenerator::aesctr_encrypt(Register src_addr, Register dest_addr, Regist
24512427
__ evpxorq(xmm0, xmm0, xmm20, Assembler::AVX_128bit);
24522428
__ vaesenc(xmm0, xmm0, xmm21, Assembler::AVX_128bit);
24532429
// Increment counter by 1
2454-
ev_add128(xmm8, xmm8, xmm19, Assembler::AVX_128bit, /*ktmp*/k1, r15 /*rscratch*/);
2430+
__ vpaddq(xmm8, xmm8, xmm19, Assembler::AVX_128bit);
24552431
__ vaesenc(xmm0, xmm0, xmm22, Assembler::AVX_128bit);
24562432
__ vaesenc(xmm0, xmm0, xmm23, Assembler::AVX_128bit);
24572433
__ vaesenc(xmm0, xmm0, xmm24, Assembler::AVX_128bit);

0 commit comments

Comments
 (0)