From 6ed4348c5abe093318e35a0e50e4205f082dc179 Mon Sep 17 00:00:00 2001
From: Yi-Fan Tsai <yftsai@amazon.com>
Date: Tue, 8 Nov 2022 15:39:14 +0000
Subject: [PATCH] 8296548: Improve MD5 intrinsic for x86_64

The LEA instruction loads the effective address, but the MD5 intrinsic
uses it to compute values rather than addresses. This usage potentially
takes more cycles than ADDs and reduces throughput.

This change replaces
    LEA:  r1 = r1 + rsi * 1 + t
with
    ADDs: r1 += t; r1 += rsi.

Microbenchmark evaluation shows a ~40% performance improvement on
Haswell, Broadwell, Skylake, and Cascade Lake, and a ~20% improvement
on 2nd gen Epyc.

There is no performance change for the same microbenchmark on Ice Lake
and 3rd gen Epyc.

Similar results can also be observed in TestMD5Intrinsics and
TestMD5MultiBlockIntrinsics with a more moderate improvement, e.g. ~15%
improvement in throughput on Haswell.
---
 src/hotspot/cpu/x86/macroAssembler_x86_md5.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/hotspot/cpu/x86/macroAssembler_x86_md5.cpp b/src/hotspot/cpu/x86/macroAssembler_x86_md5.cpp
index 506147166de34..9f6ed1b9d657d 100644
--- a/src/hotspot/cpu/x86/macroAssembler_x86_md5.cpp
+++ b/src/hotspot/cpu/x86/macroAssembler_x86_md5.cpp
@@ -66,16 +66,18 @@ void MacroAssembler::fast_md5(Register buf, Address state, Address ofs, Address
   movl(rdx, Address(rdi, 12));
 
 #define FF(r1, r2, r3, r4, k, s, t)              \
+  addl(r1, t);                                   \
   movl(rsi, r3);                                 \
   addl(r1, Address(buf, k*4));                   \
   xorl(rsi, r4);                                 \
   andl(rsi, r2);                                 \
   xorl(rsi, r4);                                 \
-  leal(r1, Address(r1, rsi, Address::times_1, t)); \
+  addl(r1, rsi);                                 \
   roll(r1, s);                                   \
   addl(r1, r2);
 
 #define GG(r1, r2, r3, r4, k, s, t)              \
+  addl(r1, t);                                   \
   movl(rsi, r4);                                 \
   movl(rdi, r4);                                 \
   addl(r1, Address(buf, k*4));                   \
@@ -83,26 +85,28 @@ void MacroAssembler::fast_md5(Register buf, Address state, Address ofs, Address
   andl(rdi, r2);                                 \
   andl(rsi, r3);                                 \
   orl(rsi, rdi);                                 \
-  leal(r1, Address(r1, rsi, Address::times_1, t)); \
+  addl(r1, rsi);                                 \
   roll(r1, s);                                   \
   addl(r1, r2);
 
 #define HH(r1, r2, r3, r4, k, s, t)              \
+  addl(r1, t);                                   \
   movl(rsi, r3);                                 \
   addl(r1, Address(buf, k*4));                   \
   xorl(rsi, r4);                                 \
   xorl(rsi, r2);                                 \
-  leal(r1, Address(r1, rsi, Address::times_1, t)); \
+  addl(r1, rsi);                                 \
   roll(r1, s);                                   \
   addl(r1, r2);
 
 #define II(r1, r2, r3, r4, k, s, t)              \
+  addl(r1, t);                                   \
   movl(rsi, r4);                                 \
   notl(rsi);                                     \
   addl(r1, Address(buf, k*4));                   \
   orl(rsi, r2);                                  \
   xorl(rsi, r3);                                 \
-  leal(r1, Address(r1, rsi, Address::times_1, t)); \
+  addl(r1, rsi);                                 \
   roll(r1, s);                                   \
   addl(r1, r2);