@@ -734,6 +734,177 @@ class StubGenerator: public StubCodeGenerator {
734734 return start;
735735 }
736736
  // Computes one multiply-and-reduce step of the Galois/Counter Mode (GCM)
  // GHASH operation: vState <- (data_block ^ vState) * H mod P, in GF(2^128).
  //
  // On entry, vH holds the (byte-order-adjusted) 16-byte data block and
  // vState the running GHASH state. The subkey H has been pre-processed by the
  // caller into vLowerH / vSwappedH / vHigherH (lower, combined and higher
  // halves), so the 128x128-bit carry-less product can be formed from three
  // 64x64-bit vpmsumd multiplications (Karatsuba-style). The partial products
  // are then folded back into 128 bits with two reduction steps using
  // vConstC2 (the reduction constant derived from 0xC2 << 56).
  // The final computed value is stored back into vState.
  //
  // vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9 and
  // vCombinedResult are clobbered as scratch; vH is clobbered as well.
  static void computeGCMProduct(MacroAssembler* _masm,
                                VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
                                VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
                                VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
                                VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
                                VectorRegister vCombinedResult, VectorRegister vSwappedH) {
    __ vxor(vH, vH, vState);                                 // Fold current state into the data block
    __ vpmsumd(vLowProduct, vLowerH, vH);                    // L : Lower Half of subkey H
    __ vpmsumd(vMidProduct, vSwappedH, vH);                  // M : Combined halves of subkey H
    __ vpmsumd(vHighProduct, vHigherH, vH);                  // H : Higher Half of subkey H
    __ vpmsumd(vReducedLow, vLowProduct, vConstC2);          // First reduction step on the low product
    __ vsldoi(vTmp8, vMidProduct, vZero, 8);                 // mL : Extract the lower 64 bits of M
    __ vsldoi(vTmp9, vZero, vMidProduct, 8);                 // mH : Extract the higher 64 bits of M
    __ vxor(vLowProduct, vLowProduct, vTmp8);                // LL + mL : Partial result for lower half
    __ vxor(vHighProduct, vHighProduct, vTmp9);              // HH + mH : Partial result for upper half
    __ vsldoi(vLowProduct, vLowProduct, vLowProduct, 8);     // Swap halves
    __ vxor(vLowProduct, vLowProduct, vReducedLow);          // Merge first reduction into the low half
    __ vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8); // Swap halves back
    __ vpmsumd(vLowProduct, vLowProduct, vConstC2);          // Second reduction step using constant
    __ vxor(vCombinedResult, vCombinedResult, vHighProduct); // Combine reduced Low & High products
    __ vxor(vState, vLowProduct, vCombinedResult);           // New GHASH state
  }
766+
  // Generate stub for ghash process blocks.
  //
  // Arguments for generated stub:
  //   state:   R3_ARG1 (long[] state)  - 16-byte GHASH state, updated in place
  //   subkeyH: R4_ARG2 (long[] subH)   - 16-byte hash subkey H
  //   data:    R5_ARG3 (byte[] data)   - input bytes (need not be 16-byte aligned)
  //   blocks:  R6_ARG4 (number of 16-byte blocks to process, must be > 0)
  //
  // The polynomials are processed in bit-reflected order for efficiency reasons.
  // This optimization leverages the structure of the Galois field arithmetic
  // to minimize the number of bit manipulations required during multiplication.
  // For an explanation of how this works, refer to:
  // Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich,
  // Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel®
  // Architecture Processor"
  // http://web.archive.org/web/20130609111954/http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/communications-ia-galois-counter-mode-paper.pdf
  //
  address generate_ghash_processBlocks() {
    StubCodeMark mark(this, "StubRoutines", "ghash");
    address start = __ function_entry();

    // Registers for parameters
    Register state   = R3_ARG1;  // long[] state
    Register subkeyH = R4_ARG2;  // long[] subH
    Register data    = R5_ARG3;  // byte[] data
    Register blocks  = R6_ARG4;  // number of 16-byte blocks
    Register temp1   = R8;       // scratch
    // Vector Registers
    VectorRegister vZero           = VR0;   // all-zero
    VectorRegister vH              = VR1;   // subkey H, later the current data block
    VectorRegister vLowerH         = VR2;   // lower half of preprocessed H
    VectorRegister vHigherH        = VR3;   // higher half of preprocessed H
    VectorRegister vLowProduct     = VR4;
    VectorRegister vMidProduct     = VR5;
    VectorRegister vHighProduct    = VR6;
    VectorRegister vReducedLow     = VR7;
    VectorRegister vTmp8           = VR8;
    VectorRegister vTmp9           = VR9;
    VectorRegister vTmp10          = VR10;
    VectorRegister vSwappedH       = VR11;  // H with halves swapped
    VectorRegister vTmp12          = VR12;
    VectorRegister loadOrder       = VR13;  // byte-swap permutation for aligned loads
    VectorRegister vHigh           = VR14;  // previous quadword (unaligned path)
    VectorRegister vLow            = VR15;  // next quadword (unaligned path)
    VectorRegister vState          = VR16;  // running GHASH state
    VectorRegister vPerm           = VR17;  // realignment permutation (unaligned path)
    VectorRegister vCombinedResult = VR18;
    VectorRegister vConstC2        = VR19;  // GHASH reduction constant

    __ li(temp1, 0xc2);                       // Reduction constant: 0xC2 << 56
    __ sldi(temp1, temp1, 56);
    __ vspltisb(vZero, 0);
    __ mtvrd(vConstC2, temp1);
    __ lxvd2x(vH->to_vsr(), subkeyH);         // Load subkey H
    __ lxvd2x(vState->to_vsr(), state);       // Load current GHASH state
    // Operations to obtain lower and higher bytes of subkey H.
    __ vspltisb(vReducedLow, 1);
    __ vspltisb(vTmp10, 7);
    __ vsldoi(vTmp8, vZero, vReducedLow, 1);  // 0x1
    __ vor(vTmp8, vConstC2, vTmp8);           // 0xC2...1
    __ vsplt(vTmp9, 0, vH);                   // MSB of H
    __ vsl(vH, vH, vReducedLow);              // Carry = H<<7
    __ vsrab(vTmp9, vTmp9, vTmp10);           // Broadcast the MSB into every bit of each byte
    __ vand(vTmp9, vTmp9, vTmp8);             // Carry
    __ vxor(vTmp10, vH, vTmp9);               // Preprocessed subkey H
    __ vsldoi(vConstC2, vZero, vConstC2, 8);  // Keep the reduction constant in the low doubleword
    __ vsldoi(vSwappedH, vTmp10, vTmp10, 8);  // swap Lower and Higher Halves of subkey H
    __ vsldoi(vLowerH, vZero, vSwappedH, 8);  // H.L
    __ vsldoi(vHigherH, vSwappedH, vZero, 8); // H.H
#ifdef ASSERT
    __ cmpwi(CCR0, blocks, 0);                // Compare 'blocks' (R6_ARG4) with zero
    __ asm_assert_ne("blocks should NOT be zero");
#endif
    __ clrldi(blocks, blocks, 32);            // Zero-extend the 32-bit block count
    __ mtctr(blocks);                         // Loop count lives in CTR (bdnz below)
    __ lvsl(loadOrder, temp1);                // temp1 still holds 0xC2 << 56: low 4 bits are zero,
                                              // so this yields the identity byte permutation
#ifdef VM_LITTLE_ENDIAN
    __ vspltisb(vTmp12, 0xf);
    __ vxor(loadOrder, loadOrder, vTmp12);    // Turn it into a byte-reversing permutation for LE
#define LE_swap_bytes(x) __ vec_perm(x, x, x, loadOrder)
#else
#define LE_swap_bytes(x)
#endif

    // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation.
    //
    // The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts,
    // performing three 128-bit multiplications and combining the results efficiently.
    //
    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
    //
    // Inputs:
    // - vH: The data vector (state), containing both B0 (lower half) and B1 (higher half).
    // - vLowerH: Lower half of the subkey H (A0).
    // - vHigherH: Higher half of the subkey H (A1).
    // - vConstC2: Constant used for reduction (for final processing).
    //
    // References:
    // Shay Gueron, Michael E. Kounavis.
    // "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
    // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
    //
    Label L_aligned_loop, L_store, L_unaligned_loop, L_initialize_unaligned_loop;
    __ andi(temp1, data, 15);                 // Check 16-byte alignment of 'data'
    __ cmpwi(CCR0, temp1, 0);
    __ bne(CCR0, L_initialize_unaligned_loop);

    // Aligned path: lvx can load each block directly.
    __ bind(L_aligned_loop);
    __ lvx(vH, temp1, data);                  // temp1 == 0 on this path (andi result above)
    LE_swap_bytes(vH);
    computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
                      vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
    __ addi(data, data, 16);
    __ bdnz(L_aligned_loop);
    __ b(L_store);

    // Unaligned path: lvx ignores the low address bits, so each 16-byte block
    // is assembled from two neighboring aligned quadwords via vec_perm.
    __ bind(L_initialize_unaligned_loop);
    __ li(temp1, 0);
    __ lvsl(vPerm, temp1, data);              // Realignment permutation from the address offset
    __ lvx(vHigh, temp1, data);               // First (aligned-down) quadword
#ifdef VM_LITTLE_ENDIAN
    __ vspltisb(vTmp12, -1);
    __ vxor(vPerm, vPerm, vTmp12);            // Adjust the permutation for little endian
#endif
    __ bind(L_unaligned_loop);
    __ addi(data, data, 16);
    __ lvx(vLow, temp1, data);                // Next aligned quadword
    __ vec_perm(vH, vHigh, vLow, vPerm);      // Extract the 16 data bytes spanning both quadwords
    computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
                      vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
    __ vmr(vHigh, vLow);                      // Carry the loaded quadword into the next iteration
    __ bdnz(L_unaligned_loop);

    __ bind(L_store);
    __ stxvd2x(vState->to_vsr(), state);      // Store the updated GHASH state back
    __ blr();

    return start;
  }
737908 // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
738909 //
739910 // The code is implemented(ported from sparc) as we believe it benefits JVM98, however
@@ -4851,6 +5022,10 @@ class StubGenerator: public StubCodeGenerator {
48515022 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync ();
48525023 }
48535024
5025+ if (UseGHASHIntrinsics) {
5026+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks ();
5027+ }
5028+
48545029 if (UseAESIntrinsics) {
48555030 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock ();
48565031 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock ();
0 commit comments