Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
59 commits
Select commit Hold shift + click to select a range
662d3d8
skeleton code
suchismith1993 Jul 18, 2024
209f2e5
add instructions for gcm shift
suchismith1993 Jul 19, 2024
ea99cc8
further code
suchismith1993 Jul 22, 2024
ea4d59a
ghash
suchismith1993 Sep 3, 2024
a37cb6f
chnage byte order
suchismith1993 Oct 8, 2024
cbf9883
reduction phase
suchismith1993 Oct 17, 2024
5e5a9bd
reduction phase
suchismith1993 Oct 17, 2024
775e719
reduction phase
suchismith1993 Oct 21, 2024
5ff7222
alignment issues
suchismith1993 Oct 25, 2024
fb5c946
alginment for load and store
suchismith1993 Nov 11, 2024
f41e52c
alginment for load and store
suchismith1993 Dec 9, 2024
8e1678d
alginment for load and store
suchismith1993 Dec 9, 2024
8ad8559
spaces
suchismith1993 Dec 9, 2024
912b8b8
spaces fix
suchismith1993 Dec 9, 2024
8e9b251
spaces fix
suchismith1993 Dec 9, 2024
b66e408
comments
suchismith1993 Dec 10, 2024
47190bf
change load instructions
suchismith1993 Dec 14, 2024
ff901a8
using power 8 loadinstructions
suchismith1993 Dec 16, 2024
acbca2d
using power 8 loadinstructions
suchismith1993 Dec 18, 2024
1aebac8
comments
suchismith1993 Dec 18, 2024
b2766a9
comments
suchismith1993 Dec 18, 2024
f357d6e
spaces fix
suchismith1993 Dec 18, 2024
f4d2e75
spaces fix
suchismith1993 Dec 18, 2024
3487414
reuse registers to reduce count
suchismith1993 Jan 8, 2025
59acef1
check for vsx
suchismith1993 Jan 8, 2025
c323460
clearing bits
suchismith1993 Jan 8, 2025
6970291
clearing bits
suchismith1993 Jan 8, 2025
09df762
assertion for blocks
suchismith1993 Jan 9, 2025
eadd408
restore
suchismith1993 Jan 9, 2025
3bd8a27
Comments and vsx check
suchismith1993 Jan 22, 2025
047142f
update references
suchismith1993 Jan 22, 2025
61cb973
spaces
suchismith1993 Jan 22, 2025
9ac07dc
spaces
suchismith1993 Jan 22, 2025
41b1d8c
vsx logic change
suchismith1993 Jan 22, 2025
bca7f69
comments
suchismith1993 Jan 23, 2025
50ca470
indentation
suchismith1993 Jan 23, 2025
24f3379
permute vHigh,vLow
suchismith1993 Jan 28, 2025
b4aa7a8
restore chnges
suchismith1993 Jan 28, 2025
fc2f1c3
restore chnges
suchismith1993 Jan 28, 2025
6388d4e
Merge branch 'openjdk:master' into ghash_processblocks
suchismith1993 Feb 5, 2025
068a248
adapt Condition registers
suchismith1993 Feb 5, 2025
d22fcf2
Merge branch 'openjdk:master' into ghash_processblocks
suchismith1993 Feb 7, 2025
79d470b
Aligned accesses
suchismith1993 Feb 10, 2025
1272375
common code function
suchismith1993 Feb 10, 2025
cf3f1d4
common code function
suchismith1993 Feb 11, 2025
a7d9a96
common code function
suchismith1993 Feb 11, 2025
5b94a7a
Single load inside loop
suchismith1993 Feb 17, 2025
b3fe9d6
remove not needed variables
suchismith1993 Feb 19, 2025
b37b09d
remove not needed variables
suchismith1993 Feb 19, 2025
68565d4
change branch and remove not needed variables
suchismith1993 Feb 20, 2025
467af71
change branch and remove not needed variables
suchismith1993 Feb 20, 2025
55ba886
change pattern for Linux, fix for AIX
suchismith1993 Feb 26, 2025
474b891
change pattern for Linux, fix for AIX
suchismith1993 Feb 26, 2025
3bca30f
use vsplitsb
suchismith1993 Feb 27, 2025
90d58e6
comments
suchismith1993 Mar 18, 2025
89dfcaf
comments
suchismith1993 Mar 18, 2025
a41fdc2
comments
suchismith1993 Mar 18, 2025
9231e7b
Merge branch 'openjdk:master' into ghash_processblocks
suchismith1993 Apr 24, 2025
423c868
masm
suchismith1993 Apr 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 175 additions & 0 deletions src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,177 @@ class StubGenerator: public StubCodeGenerator {
return start;
}

// Emits the instructions for one GHASH step of Galois/Counter Mode (GCM):
//   vState = (vState ^ vH) * H, reduced so the result stays within GF(2^128).
//
// Three `vpmsumd` polynomial multiplications against vLowerH, vSwappedH and
// vHigherH produce the low (L), middle (M) and high (H) partial products.
// M is split into its two 64-bit halves, which are folded into L and H, and
// the result is reduced in two phases, each a `vpmsumd` with `vConstC2`.
// The final value is written to `vState`.
//
// Parameters:
//   _masm           - assembler receiving the emitted code (presumably what the `__` macro expands to).
//   vLowerH         - lower half of the prepared subkey H (read only).
//   vH              - in: the 16-byte block to absorb; overwritten with vH ^ vState.
//   vHigherH        - higher half of the prepared subkey H (read only).
//   vConstC2        - reduction constant (read only).
//   vZero           - expected to hold all zeroes (the caller in this file sets it with
//                     vspltisb(vZero, 0)); used as the filler operand of `vsldoi` (read only).
//   vState          - in: current GHASH state; out: updated GHASH state.
//   vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult
//                   - scratch registers, clobbered.
//   vSwappedH       - prepared subkey H with its two halves swapped (read only).
static void computeGCMProduct(MacroAssembler* _masm,
VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
VectorRegister vCombinedResult, VectorRegister vSwappedH) {
__ vxor(vH, vH, vState); // Fold the current state into the input block (clobbers vH)
__ vpmsumd(vLowProduct, vLowerH, vH); // L : Lower Half of subkey H
__ vpmsumd(vMidProduct, vSwappedH, vH); // M : Combined halves of subkey H
__ vpmsumd(vHighProduct, vHigherH, vH); // H : Higher Half of subkey H
__ vpmsumd(vReducedLow, vLowProduct, vConstC2); // Reduction, 1st phase (uses L before M is folded in)
__ vsldoi(vTmp8, vMidProduct, vZero, 8); // mL : Extract the lower 64 bits of M
__ vsldoi(vTmp9, vZero, vMidProduct, 8); // mH : Extract the higher 64 bits of M
__ vxor(vLowProduct, vLowProduct, vTmp8); // LL + mL : Partial result for lower half
__ vxor(vHighProduct, vHighProduct, vTmp9); // HH + mH : Partial result for upper half
__ vsldoi(vLowProduct, vLowProduct, vLowProduct, 8); // Swap the two 64-bit halves
__ vxor(vLowProduct, vLowProduct, vReducedLow); // Apply the 1st-phase reduction term
__ vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8); // Swap the two 64-bit halves again
__ vpmsumd(vLowProduct, vLowProduct, vConstC2); // Reduction, 2nd phase, using constant
__ vxor(vCombinedResult, vCombinedResult, vHighProduct); // Combine reduced Low & High products
__ vxor(vState, vLowProduct, vCombinedResult); // New GHASH state
}

// Generate stub for ghash process blocks.
//
// Arguments for generated stub:
//      state:    R3_ARG1 (long[] state)
//      subkeyH:  R4_ARG2 (long[] subH)
//      data:     R5_ARG3 (byte[] data)
//      blocks:   R6_ARG4 (number of 16-byte blocks to process)
//
// Notes on the generated code:
//  - 'blocks' must not be zero. This is only checked in ASSERT builds; both loops are
//    CTR-driven and test the counter after the body, so at least one block is always processed.
//  - 'data' may have any alignment. A 16-byte aligned pointer takes the plain lvx loop;
//    otherwise each block is assembled from two aligned loads with a permute.
//  - Scratch registers used: R8, CR0, CTR and VR0..VR19.
//  - The updated state is stored back to 'state' before returning.
//
// The polynomials are processed in bit-reflected order for efficiency reasons.
// This optimization leverages the structure of the Galois field arithmetic
// to minimize the number of bit manipulations required during multiplication.
// For an explanation of how this works, refer to:
//     Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich,
//     Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel®
//     Architecture Processor"
//     http://web.archive.org/web/20130609111954/http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/communications-ia-galois-counter-mode-paper.pdf
//
address generate_ghash_processBlocks() {
  StubCodeMark mark(this, "StubRoutines", "ghash");
  address start = __ function_entry();

  // Registers for parameters
  Register state   = R3_ARG1;                   // long[] state
  Register subkeyH = R4_ARG2;                   // long[] subH
  Register data    = R5_ARG3;                   // byte[] data
  Register blocks  = R6_ARG4;                   // number of 16-byte blocks
  Register temp1   = R8;
  // Vector Registers
  VectorRegister vZero           = VR0;
  VectorRegister vH              = VR1;
  VectorRegister vLowerH         = VR2;
  VectorRegister vHigherH        = VR3;
  VectorRegister vLowProduct     = VR4;
  VectorRegister vMidProduct     = VR5;
  VectorRegister vHighProduct    = VR6;
  VectorRegister vReducedLow     = VR7;
  VectorRegister vTmp8           = VR8;
  VectorRegister vTmp9           = VR9;
  VectorRegister vTmp10          = VR10;
  VectorRegister vSwappedH       = VR11;
  VectorRegister vTmp12          = VR12;
  VectorRegister loadOrder       = VR13;
  VectorRegister vHigh           = VR14;
  VectorRegister vLow            = VR15;
  VectorRegister vState          = VR16;
  VectorRegister vPerm           = VR17;
  VectorRegister vCombinedResult = VR18;
  VectorRegister vConstC2        = VR19;

  // temp1 = 0xC2 << 56, the reduction constant placed in the most significant byte.
  __ li(temp1, 0xc2);
  __ sldi(temp1, temp1, 56);
  __ vspltisb(vZero, 0);
  // NOTE(review): the vor below reads all 128 bits of vConstC2, while mtvrd only
  // transfers one doubleword - confirm the other doubleword is zero on all supported CPUs.
  __ mtvrd(vConstC2, temp1);
  __ lxvd2x(vH->to_vsr(), subkeyH);
  __ lxvd2x(vState->to_vsr(), state);
  // Operations to obtain lower and higher bytes of subkey H.
  __ vspltisb(vReducedLow, 1);                        // shift count 1 in every byte
  __ vspltisb(vTmp10, 7);                             // shift count 7 in every byte
  __ vsldoi(vTmp8, vZero, vReducedLow, 1);            // 0x1
  __ vor(vTmp8, vConstC2, vTmp8);                     // 0xC2...1
  __ vsplt(vTmp9, 0, vH);                             // MSB of H
  __ vsl(vH, vH, vReducedLow);                        // H <<= 1 (shift count is the splat of 1)
  __ vsrab(vTmp9, vTmp9, vTmp10);                     // Broadcast the top bit of the MSB into a mask
  __ vand(vTmp9, vTmp9, vTmp8);                       // Carry
  __ vxor(vTmp10, vH, vTmp9);                         // (H << 1) ^ Carry
  __ vsldoi(vConstC2, vZero, vConstC2, 8);            // Move the constant into the other doubleword
  __ vsldoi(vSwappedH, vTmp10, vTmp10, 8);            // swap Lower and Higher Halves of subkey H
  __ vsldoi(vLowerH, vZero, vSwappedH, 8);            // H.L
  __ vsldoi(vHigherH, vSwappedH, vZero, 8);           // H.H
#ifdef ASSERT
  __ cmpwi(CR0, blocks, 0);                           // Compare 'blocks' (R6_ARG4) with zero
  __ asm_assert_ne("blocks should NOT be zero");
#endif
  __ clrldi(blocks, blocks, 32);                      // Keep only the low 32 bits of 'blocks'
  __ mtctr(blocks);                                   // CTR = loop trip count
  // temp1 still holds 0xC2 << 56, whose low 4 bits are zero, so this lvsl yields the
  // permute vector for a zero offset. On little-endian it is XORed with 0x0F per byte
  // and then used by LE_swap_bytes as the control vector of vec_perm.
  __ lvsl(loadOrder, temp1);
#ifdef VM_LITTLE_ENDIAN
  __ vspltisb(vTmp12, 0xf);
  __ vxor(loadOrder, loadOrder, vTmp12);
#define LE_swap_bytes(x) __ vec_perm(x, x, x, loadOrder)
#else
#define LE_swap_bytes(x)
#endif

  // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation.
  //
  // The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts,
  // performing three 128-bit multiplications and combining the results efficiently.
  //
  // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
  // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
  //
  // Inputs:
  // - vH:       The data vector (state), containing both B0 (lower half) and B1 (higher half).
  // - vLowerH:  Lower half of the subkey H (A0).
  // - vHigherH: Higher half of the subkey H (A1).
  // - vConstC2: Constant used for reduction (for final processing).
  //
  // References:
  // Shay Gueron, Michael E. Kounavis.
  // "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
  // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
  //
  Label L_aligned_loop, L_store, L_unaligned_loop, L_initialize_unaligned_loop;
  __ andi(temp1, data, 15);                           // temp1 = misalignment of 'data' within 16 bytes
  __ cmpwi(CR0, temp1, 0);
  __ bne(CR0, L_initialize_unaligned_loop);

  // Aligned case: temp1 is 0 here, so lvx loads directly from 'data'.
  __ bind(L_aligned_loop);
  __ lvx(vH, temp1, data);
  LE_swap_bytes(vH);
  computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
                    vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
  __ addi(data, data, 16);
  __ bdnz(L_aligned_loop);
  __ b(L_store);

  // Unaligned case: every block is merged from two consecutive lvx loads (vHigh, vLow)
  // through vPerm. vHigh is preloaded here and carried over between iterations.
  __ bind(L_initialize_unaligned_loop);
  __ li(temp1, 0);
  __ lvsl(vPerm, temp1, data);
  __ lvx(vHigh, temp1, data);
#ifdef VM_LITTLE_ENDIAN
  __ vspltisb(vTmp12, -1);
  __ vxor(vPerm, vPerm, vTmp12);                      // Complement the permute control on little-endian
#endif
  __ bind(L_unaligned_loop);
  __ addi(data, data, 16);
  __ lvx(vLow, temp1, data);
  __ vec_perm(vH, vHigh, vLow, vPerm);
  computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
                    vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
  __ vmr(vHigh, vLow);                                // This iteration's second load is the next one's first
  __ bdnz(L_unaligned_loop);

  __ bind(L_store);
  __ stxvd2x(vState->to_vsr(), state);
  __ blr();

  // The helper macro is local to this stub; do not leak it into the rest of the file.
#undef LE_swap_bytes

  return start;
}
// -XX:+OptimizeFill : convert fill/copy loops into intrinsic
//
// The code is implemented(ported from sparc) as we believe it benefits JVM98, however
Expand Down Expand Up @@ -5028,6 +5199,10 @@ void generate_lookup_secondary_supers_table_stub() {
StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
}

This comment was marked as resolved.


if (UseGHASHIntrinsics) {
StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
}

if (UseAESIntrinsics) {
StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
Expand Down
10 changes: 8 additions & 2 deletions src/hotspot/cpu/ppc/vm_version_ppc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,8 +308,14 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
}

if (UseGHASHIntrinsics) {
warning("GHASH intrinsics are not available on this CPU");
if (VM_Version::has_vsx()) {
if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
UseGHASHIntrinsics = true;
}
} else if (UseGHASHIntrinsics) {
if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
warning("GHASH intrinsics are not available on this CPU");

This comment was marked as resolved.

}
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}

Expand Down