From 6ed4348c5abe093318e35a0e50e4205f082dc179 Mon Sep 17 00:00:00 2001 From: Yi-Fan Tsai Date: Tue, 8 Nov 2022 15:39:14 +0000 Subject: [PATCH] 8296548: Improve MD5 intrinsic for x86_64 The LEA instruction loads the effective address, but the MD5 intrinsic uses it for computing values rather than addresses. This usage potentially uses more cycles than ADDs and reduces the throughput. This change replaces LEA: r1 = r1 + rsi * 1 + t with ADDs: r1 += t; r1 += rsi. Microbenchmark evaluation shows ~40% performance improvement on Haswell, Broadwell, Skylake, and Cascade Lake. There is ~20% improvement on 2nd gen Epyc. No performance change for the same microbenchmark on Ice Lake and 3rd gen Epyc. Similar results can also be observed in TestMD5Intrinsics and TestMD5MultiBlockIntrinsics with a more moderate improvement, e.g. ~15% improvement in throughput on Haswell. --- src/hotspot/cpu/x86/macroAssembler_x86_md5.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/hotspot/cpu/x86/macroAssembler_x86_md5.cpp b/src/hotspot/cpu/x86/macroAssembler_x86_md5.cpp index 506147166de34..9f6ed1b9d657d 100644 --- a/src/hotspot/cpu/x86/macroAssembler_x86_md5.cpp +++ b/src/hotspot/cpu/x86/macroAssembler_x86_md5.cpp @@ -66,16 +66,18 @@ void MacroAssembler::fast_md5(Register buf, Address state, Address ofs, Address movl(rdx, Address(rdi, 12)); #define FF(r1, r2, r3, r4, k, s, t) \ + addl(r1, t); \ movl(rsi, r3); \ addl(r1, Address(buf, k*4)); \ xorl(rsi, r4); \ andl(rsi, r2); \ xorl(rsi, r4); \ - leal(r1, Address(r1, rsi, Address::times_1, t)); \ + addl(r1, rsi); \ roll(r1, s); \ addl(r1, r2); #define GG(r1, r2, r3, r4, k, s, t) \ + addl(r1, t); \ movl(rsi, r4); \ movl(rdi, r4); \ addl(r1, Address(buf, k*4)); \ @@ -83,26 +85,28 @@ void MacroAssembler::fast_md5(Register buf, Address state, Address ofs, Address andl(rdi, r2); \ andl(rsi, r3); \ orl(rsi, rdi); \ - leal(r1, Address(r1, rsi, Address::times_1, t)); \ + addl(r1, rsi); \ roll(r1, s); \ addl(r1, r2); #define 
HH(r1, r2, r3, r4, k, s, t) \ + addl(r1, t); \ movl(rsi, r3); \ addl(r1, Address(buf, k*4)); \ xorl(rsi, r4); \ xorl(rsi, r2); \ - leal(r1, Address(r1, rsi, Address::times_1, t)); \ + addl(r1, rsi); \ roll(r1, s); \ addl(r1, r2); #define II(r1, r2, r3, r4, k, s, t) \ + addl(r1, t); \ movl(rsi, r4); \ notl(rsi); \ addl(r1, Address(buf, k*4)); \ orl(rsi, r2); \ xorl(rsi, r3); \ - leal(r1, Address(r1, rsi, Address::times_1, t)); \ + addl(r1, rsi); \ roll(r1, s); \ addl(r1, r2);