Skip to content

Commit 8474e69

Browse files
Yi-Fan Tsai, Paul Hohensee
Yi-Fan Tsai
authored and
Paul Hohensee
committed
8308465: Reduce memory accesses in AArch64 MD5 intrinsic
Reviewed-by: aph, phh
1 parent f99ad11 commit 8474e69

File tree

1 file changed

+127
-94
lines changed

1 file changed

+127
-94
lines changed

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

Lines changed: 127 additions & 94 deletions
Original file line number | Diff line number | Diff line change
@@ -3332,9 +3332,36 @@ class StubGenerator: public StubCodeGenerator {
33323332
return start;
33333333
}
33343334

3335+
// Caches one 64-byte block in eight registers so that each of the sixteen
// 4-byte words it contains can be read with a bit-field extract instead of a
// separate memory load.
// NOTE(review): `__` is presumably the file's shorthand for emitting through
// _masm; its definition is outside this view.
class Cached64Bytes {
3336+
private:
3337+
// Assembler used to emit the generated instructions.
MacroAssembler *_masm;
3338+
// The eight cache registers; _regs[j] holds words 2*j and 2*j + 1.
Register _regs[8];
3339+
3340+
public:
3341+
// Records the registers of rs, in the set's iteration order, as the cache
// registers. Asserts that rs contains exactly eight registers.
Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3342+
assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size());
3343+
auto it = rs.begin();
3344+
for (auto &r: _regs) {
3345+
r = *it;
3346+
++it;
3347+
}
3348+
}
3349+
3350+
// Emits four ldp instructions that fill the cache registers pairwise from
// base + 0, base + 16, base + 32 and base + 48.
void gen_loads(Register base) {
3351+
for (int i = 0; i < 8; i += 2) {
3352+
__ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3353+
}
3354+
}
3355+
3356+
// Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
// Word i is taken from _regs[i / 2]: bits 0..31 when i is even, bits 32..63
// when i is odd. i is not range-checked here; with eight registers the valid
// range is 0..15.
3357+
void extract_u32(Register dest, int i) {
3358+
__ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3359+
}
3360+
};
3361+
33353362
// Utility routines for md5.
33363363
// Clobbers r10 and r11 (aliased below as rscratch3/rscratch4), in addition to rscratch1 and rscratch2.
3337-
void md5_FF(Register buf, Register r1, Register r2, Register r3, Register r4,
3364+
void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
33383365
int k, int s, int t) {
33393366
Register rscratch3 = r10;
33403367
Register rscratch4 = r11;
@@ -3343,22 +3370,22 @@ class StubGenerator: public StubCodeGenerator {
33433370
__ movw(rscratch2, t);
33443371
__ andw(rscratch3, rscratch3, r2);
33453372
__ addw(rscratch4, r1, rscratch2);
3346-
__ ldrw(rscratch1, Address(buf, k*4));
3373+
reg_cache.extract_u32(rscratch1, k);
33473374
__ eorw(rscratch3, rscratch3, r4);
33483375
__ addw(rscratch4, rscratch4, rscratch1);
33493376
__ addw(rscratch3, rscratch3, rscratch4);
33503377
__ rorw(rscratch2, rscratch3, 32 - s);
33513378
__ addw(r1, rscratch2, r2);
33523379
}
33533380

3354-
void md5_GG(Register buf, Register r1, Register r2, Register r3, Register r4,
3381+
void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
33553382
int k, int s, int t) {
33563383
Register rscratch3 = r10;
33573384
Register rscratch4 = r11;
33583385

33593386
__ andw(rscratch3, r2, r4);
33603387
__ bicw(rscratch4, r3, r4);
3361-
__ ldrw(rscratch1, Address(buf, k*4));
3388+
reg_cache.extract_u32(rscratch1, k);
33623389
__ movw(rscratch2, t);
33633390
__ orrw(rscratch3, rscratch3, rscratch4);
33643391
__ addw(rscratch4, r1, rscratch2);
@@ -3368,31 +3395,31 @@ class StubGenerator: public StubCodeGenerator {
33683395
__ addw(r1, rscratch2, r2);
33693396
}
33703397

3371-
void md5_HH(Register buf, Register r1, Register r2, Register r3, Register r4,
3398+
void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
33723399
int k, int s, int t) {
33733400
Register rscratch3 = r10;
33743401
Register rscratch4 = r11;
33753402

33763403
__ eorw(rscratch3, r3, r4);
33773404
__ movw(rscratch2, t);
33783405
__ addw(rscratch4, r1, rscratch2);
3379-
__ ldrw(rscratch1, Address(buf, k*4));
3406+
reg_cache.extract_u32(rscratch1, k);
33803407
__ eorw(rscratch3, rscratch3, r2);
33813408
__ addw(rscratch4, rscratch4, rscratch1);
33823409
__ addw(rscratch3, rscratch3, rscratch4);
33833410
__ rorw(rscratch2, rscratch3, 32 - s);
33843411
__ addw(r1, rscratch2, r2);
33853412
}
33863413

3387-
void md5_II(Register buf, Register r1, Register r2, Register r3, Register r4,
3414+
void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
33883415
int k, int s, int t) {
33893416
Register rscratch3 = r10;
33903417
Register rscratch4 = r11;
33913418

33923419
__ movw(rscratch3, t);
33933420
__ ornw(rscratch2, r2, r4);
33943421
__ addw(rscratch4, r1, rscratch3);
3395-
__ ldrw(rscratch1, Address(buf, k*4));
3422+
reg_cache.extract_u32(rscratch1, k);
33963423
__ eorw(rscratch3, rscratch2, r3);
33973424
__ addw(rscratch4, rscratch4, rscratch1);
33983425
__ addw(rscratch3, rscratch3, rscratch4);
@@ -3424,103 +3451,104 @@ class StubGenerator: public StubCodeGenerator {
34243451
Register rscratch3 = r10;
34253452
Register rscratch4 = r11;
34263453

3454+
Register state_regs[2] = { r12, r13 };
3455+
RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3456+
Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
3457+
3458+
__ push(saved_regs, sp);
3459+
3460+
__ ldp(state_regs[0], state_regs[1], Address(state));
3461+
__ ubfx(a, state_regs[0], 0, 32);
3462+
__ ubfx(b, state_regs[0], 32, 32);
3463+
__ ubfx(c, state_regs[1], 0, 32);
3464+
__ ubfx(d, state_regs[1], 32, 32);
3465+
34273466
Label md5_loop;
34283467
__ BIND(md5_loop);
34293468

3430-
// Save hash values for addition after rounds
3431-
__ ldrw(a, Address(state, 0));
3432-
__ ldrw(b, Address(state, 4));
3433-
__ ldrw(c, Address(state, 8));
3434-
__ ldrw(d, Address(state, 12));
3469+
reg_cache.gen_loads(buf);
34353470

34363471
// Round 1
3437-
md5_FF(buf, a, b, c, d, 0, 7, 0xd76aa478);
3438-
md5_FF(buf, d, a, b, c, 1, 12, 0xe8c7b756);
3439-
md5_FF(buf, c, d, a, b, 2, 17, 0x242070db);
3440-
md5_FF(buf, b, c, d, a, 3, 22, 0xc1bdceee);
3441-
md5_FF(buf, a, b, c, d, 4, 7, 0xf57c0faf);
3442-
md5_FF(buf, d, a, b, c, 5, 12, 0x4787c62a);
3443-
md5_FF(buf, c, d, a, b, 6, 17, 0xa8304613);
3444-
md5_FF(buf, b, c, d, a, 7, 22, 0xfd469501);
3445-
md5_FF(buf, a, b, c, d, 8, 7, 0x698098d8);
3446-
md5_FF(buf, d, a, b, c, 9, 12, 0x8b44f7af);
3447-
md5_FF(buf, c, d, a, b, 10, 17, 0xffff5bb1);
3448-
md5_FF(buf, b, c, d, a, 11, 22, 0x895cd7be);
3449-
md5_FF(buf, a, b, c, d, 12, 7, 0x6b901122);
3450-
md5_FF(buf, d, a, b, c, 13, 12, 0xfd987193);
3451-
md5_FF(buf, c, d, a, b, 14, 17, 0xa679438e);
3452-
md5_FF(buf, b, c, d, a, 15, 22, 0x49b40821);
3472+
md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
3473+
md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
3474+
md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
3475+
md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
3476+
md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
3477+
md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
3478+
md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
3479+
md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
3480+
md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
3481+
md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
3482+
md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3483+
md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3484+
md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
3485+
md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3486+
md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3487+
md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
34533488

34543489
// Round 2
3455-
md5_GG(buf, a, b, c, d, 1, 5, 0xf61e2562);
3456-
md5_GG(buf, d, a, b, c, 6, 9, 0xc040b340);
3457-
md5_GG(buf, c, d, a, b, 11, 14, 0x265e5a51);
3458-
md5_GG(buf, b, c, d, a, 0, 20, 0xe9b6c7aa);
3459-
md5_GG(buf, a, b, c, d, 5, 5, 0xd62f105d);
3460-
md5_GG(buf, d, a, b, c, 10, 9, 0x02441453);
3461-
md5_GG(buf, c, d, a, b, 15, 14, 0xd8a1e681);
3462-
md5_GG(buf, b, c, d, a, 4, 20, 0xe7d3fbc8);
3463-
md5_GG(buf, a, b, c, d, 9, 5, 0x21e1cde6);
3464-
md5_GG(buf, d, a, b, c, 14, 9, 0xc33707d6);
3465-
md5_GG(buf, c, d, a, b, 3, 14, 0xf4d50d87);
3466-
md5_GG(buf, b, c, d, a, 8, 20, 0x455a14ed);
3467-
md5_GG(buf, a, b, c, d, 13, 5, 0xa9e3e905);
3468-
md5_GG(buf, d, a, b, c, 2, 9, 0xfcefa3f8);
3469-
md5_GG(buf, c, d, a, b, 7, 14, 0x676f02d9);
3470-
md5_GG(buf, b, c, d, a, 12, 20, 0x8d2a4c8a);
3490+
md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
3491+
md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
3492+
md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3493+
md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
3494+
md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
3495+
md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
3496+
md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3497+
md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
3498+
md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
3499+
md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
3500+
md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
3501+
md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
3502+
md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
3503+
md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
3504+
md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
3505+
md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
34713506

34723507
// Round 3
3473-
md5_HH(buf, a, b, c, d, 5, 4, 0xfffa3942);
3474-
md5_HH(buf, d, a, b, c, 8, 11, 0x8771f681);
3475-
md5_HH(buf, c, d, a, b, 11, 16, 0x6d9d6122);
3476-
md5_HH(buf, b, c, d, a, 14, 23, 0xfde5380c);
3477-
md5_HH(buf, a, b, c, d, 1, 4, 0xa4beea44);
3478-
md5_HH(buf, d, a, b, c, 4, 11, 0x4bdecfa9);
3479-
md5_HH(buf, c, d, a, b, 7, 16, 0xf6bb4b60);
3480-
md5_HH(buf, b, c, d, a, 10, 23, 0xbebfbc70);
3481-
md5_HH(buf, a, b, c, d, 13, 4, 0x289b7ec6);
3482-
md5_HH(buf, d, a, b, c, 0, 11, 0xeaa127fa);
3483-
md5_HH(buf, c, d, a, b, 3, 16, 0xd4ef3085);
3484-
md5_HH(buf, b, c, d, a, 6, 23, 0x04881d05);
3485-
md5_HH(buf, a, b, c, d, 9, 4, 0xd9d4d039);
3486-
md5_HH(buf, d, a, b, c, 12, 11, 0xe6db99e5);
3487-
md5_HH(buf, c, d, a, b, 15, 16, 0x1fa27cf8);
3488-
md5_HH(buf, b, c, d, a, 2, 23, 0xc4ac5665);
3508+
md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
3509+
md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
3510+
md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3511+
md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3512+
md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
3513+
md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
3514+
md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
3515+
md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3516+
md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
3517+
md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa);
3518+
md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085);
3519+
md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05);
3520+
md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039);
3521+
md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3522+
md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3523+
md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665);
34893524

34903525
// Round 4
3491-
md5_II(buf, a, b, c, d, 0, 6, 0xf4292244);
3492-
md5_II(buf, d, a, b, c, 7, 10, 0x432aff97);
3493-
md5_II(buf, c, d, a, b, 14, 15, 0xab9423a7);
3494-
md5_II(buf, b, c, d, a, 5, 21, 0xfc93a039);
3495-
md5_II(buf, a, b, c, d, 12, 6, 0x655b59c3);
3496-
md5_II(buf, d, a, b, c, 3, 10, 0x8f0ccc92);
3497-
md5_II(buf, c, d, a, b, 10, 15, 0xffeff47d);
3498-
md5_II(buf, b, c, d, a, 1, 21, 0x85845dd1);
3499-
md5_II(buf, a, b, c, d, 8, 6, 0x6fa87e4f);
3500-
md5_II(buf, d, a, b, c, 15, 10, 0xfe2ce6e0);
3501-
md5_II(buf, c, d, a, b, 6, 15, 0xa3014314);
3502-
md5_II(buf, b, c, d, a, 13, 21, 0x4e0811a1);
3503-
md5_II(buf, a, b, c, d, 4, 6, 0xf7537e82);
3504-
md5_II(buf, d, a, b, c, 11, 10, 0xbd3af235);
3505-
md5_II(buf, c, d, a, b, 2, 15, 0x2ad7d2bb);
3506-
md5_II(buf, b, c, d, a, 9, 21, 0xeb86d391);
3507-
3508-
// write hash values back in the correct order
3509-
__ ldrw(rscratch1, Address(state, 0));
3510-
__ addw(rscratch1, rscratch1, a);
3511-
__ strw(rscratch1, Address(state, 0));
3512-
3513-
__ ldrw(rscratch2, Address(state, 4));
3514-
__ addw(rscratch2, rscratch2, b);
3515-
__ strw(rscratch2, Address(state, 4));
3516-
3517-
__ ldrw(rscratch3, Address(state, 8));
3518-
__ addw(rscratch3, rscratch3, c);
3519-
__ strw(rscratch3, Address(state, 8));
3520-
3521-
__ ldrw(rscratch4, Address(state, 12));
3522-
__ addw(rscratch4, rscratch4, d);
3523-
__ strw(rscratch4, Address(state, 12));
3526+
md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244);
3527+
md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97);
3528+
md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3529+
md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039);
3530+
md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3);
3531+
md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92);
3532+
md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3533+
md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1);
3534+
md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f);
3535+
md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3536+
md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314);
3537+
md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3538+
md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82);
3539+
md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3540+
md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb);
3541+
md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391);
3542+
3543+
__ addw(a, state_regs[0], a);
3544+
__ ubfx(rscratch2, state_regs[0], 32, 32);
3545+
__ addw(b, rscratch2, b);
3546+
__ addw(c, state_regs[1], c);
3547+
__ ubfx(rscratch4, state_regs[1], 32, 32);
3548+
__ addw(d, rscratch4, d);
3549+
3550+
__ orr(state_regs[0], a, b, Assembler::LSL, 32);
3551+
__ orr(state_regs[1], c, d, Assembler::LSL, 32);
35243552

35253553
if (multi_block) {
35263554
__ add(buf, buf, 64);
@@ -3530,6 +3558,11 @@ class StubGenerator: public StubCodeGenerator {
35303558
__ mov(c_rarg0, ofs); // return ofs
35313559
}
35323560

3561+
// write hash values back in the correct order
3562+
__ stp(state_regs[0], state_regs[1], Address(state));
3563+
3564+
__ pop(saved_regs, sp);
3565+
35333566
__ ret(lr);
35343567

35353568
return start;

0 commit comments

Comments
 (0)