@@ -546,6 +546,177 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
+  // Computes the Galois/Counter Mode (GCM) product and reduction.
+  //
+  // This function multiplies the current GHASH state with the subkey H using
+  // the vector carry-less multiply instruction (`vpmsumd`). The multiplication
+  // is split into lower, middle, and higher partial products, formed from the
+  // lower half, the swapped halves, and the higher half of the subkey H.
+  // The partial products are reduced using `vConstC2` to stay within GF(2^128),
+  // and the final value is stored back into `vState`.
+  static void computeGCMProduct(MacroAssembler* _masm,
+                                VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
+                                VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
+                                VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
+                                VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
+                                VectorRegister vCombinedResult, VectorRegister vSwappedH) {
+    __ vxor(vH, vH, vState);
+    __ vpmsumd(vLowProduct, vLowerH, vH);                     // L : Lower half of subkey H
+    __ vpmsumd(vMidProduct, vSwappedH, vH);                   // M : Combined (swapped) halves of subkey H
+    __ vpmsumd(vHighProduct, vHigherH, vH);                   // H : Higher half of subkey H
+    __ vpmsumd(vReducedLow, vLowProduct, vConstC2);           // Reduction
+    __ vsldoi(vTmp8, vMidProduct, vZero, 8);                  // mL : Extract the lower 64 bits of M
+    __ vsldoi(vTmp9, vZero, vMidProduct, 8);                  // mH : Extract the higher 64 bits of M
+    __ vxor(vLowProduct, vLowProduct, vTmp8);                 // LL + mL : Partial result for lower half
+    __ vxor(vHighProduct, vHighProduct, vTmp9);               // HH + mH : Partial result for upper half
+    __ vsldoi(vLowProduct, vLowProduct, vLowProduct, 8);      // Swap
+    __ vxor(vLowProduct, vLowProduct, vReducedLow);
+    __ vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8);  // Swap
+    __ vpmsumd(vLowProduct, vLowProduct, vConstC2);           // Reduction using constant
+    __ vxor(vCombinedResult, vCombinedResult, vHighProduct);  // Combine reduced Low & High products
+    __ vxor(vState, vLowProduct, vCombinedResult);
+  }
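+
+  // For reference, the value computed above is the GF(2^128) product used by
+  // GHASH (NIST SP 800-38D). A scalar sketch of the same product, written with
+  // the spec's bit-by-bit "right shift" method, is shown below purely for
+  // illustration; it is not used by the stub and is independent of the
+  // Karatsuba/vpmsumd formulation above:
+  //
+  //   // X and Y are 128-bit field elements given as (hi, lo) 64-bit halves,
+  //   // with the leftmost bit holding the coefficient of x^0, as in the spec.
+  //   static void ghash_mul_ref(uint64_t xh, uint64_t xl,   // X
+  //                             uint64_t yh, uint64_t yl,   // Y
+  //                             uint64_t* zh, uint64_t* zl) {
+  //     uint64_t vh = yh, vl = yl, rh = 0, rl = 0;
+  //     for (int i = 0; i < 128; i++) {
+  //       uint64_t xbit = (i < 64) ? (xh >> (63 - i)) & 1 : (xl >> (127 - i)) & 1;
+  //       if (xbit) { rh ^= vh; rl ^= vl; }
+  //       uint64_t carry = vl & 1;
+  //       vl = (vl >> 1) | (vh << 63);
+  //       vh >>= 1;
+  //       if (carry) vh ^= 0xE100000000000000ULL;           // x^128 + x^7 + x^2 + x + 1
+  //     }
+  //     *zh = rh; *zl = rl;
+  //   }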
+
+  // Generate stub for GHASH process blocks.
+  //
+  // Arguments for generated stub:
+  //   state:   R3_ARG1 (long[] state)
+  //   subkeyH: R4_ARG2 (long[] subH)
+  //   data:    R5_ARG3 (byte[] data)
+  //   blocks:  R6_ARG4 (number of 16-byte blocks to process)
+  //
+  // The polynomials are processed in bit-reflected order for efficiency.
+  // This optimization leverages the structure of the Galois field arithmetic
+  // to minimize the number of bit manipulations required during multiplication.
+  // For an explanation of how this works, refer to:
+  //   Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich,
+  //   Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel®
+  //   Architecture Processors"
+  //   http://web.archive.org/web/20130609111954/http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/communications-ia-galois-counter-mode-paper.pdf
+  //
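+  // One consequence of the bit-reflected representation (noted here for
+  // illustration; see the paper above for the derivation): the GCM reduction
+  // polynomial g(x) = x^128 + x^7 + x^2 + x + 1, whose standard encoding is
+  // the byte 0xE1 followed by zeros, corresponds in the reflected algorithm
+  // to 0xE1 shifted left by one bit, i.e. the byte 0xC2. This is the
+  // doubleword constant (0xc2 << 56) materialized into vConstC2 below.
+  //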
+  address generate_ghash_processBlocks() {
+    StubCodeMark mark(this, "StubRoutines", "ghash");
+    address start = __ function_entry();
+
+    // Registers for parameters
+    Register state   = R3_ARG1;          // long[] state
+    Register subkeyH = R4_ARG2;          // long[] subH
+    Register data    = R5_ARG3;          // byte[] data
+    Register blocks  = R6_ARG4;
+    Register temp1   = R8;
+    // Vector registers
+    VectorRegister vZero           = VR0;
+    VectorRegister vH              = VR1;
+    VectorRegister vLowerH         = VR2;
+    VectorRegister vHigherH        = VR3;
+    VectorRegister vLowProduct     = VR4;
+    VectorRegister vMidProduct     = VR5;
+    VectorRegister vHighProduct    = VR6;
+    VectorRegister vReducedLow     = VR7;
+    VectorRegister vTmp8           = VR8;
+    VectorRegister vTmp9           = VR9;
+    VectorRegister vTmp10          = VR10;
+    VectorRegister vSwappedH       = VR11;
+    VectorRegister vTmp12          = VR12;
+    VectorRegister loadOrder       = VR13;
+    VectorRegister vHigh           = VR14;
+    VectorRegister vLow            = VR15;
+    VectorRegister vState          = VR16;
+    VectorRegister vPerm           = VR17;
+    VectorRegister vCombinedResult = VR18;
+    VectorRegister vConstC2        = VR19;
+
+    __ li(temp1, 0xc2);
+    __ sldi(temp1, temp1, 56);
+    __ vspltisb(vZero, 0);
+    __ mtvrd(vConstC2, temp1);
+    __ lxvd2x(vH->to_vsr(), subkeyH);
+    __ lxvd2x(vState->to_vsr(), state);
+    // Operations to obtain the lower and higher halves of subkey H.
+    __ vspltisb(vReducedLow, 1);
+    __ vspltisb(vTmp10, 7);
+    __ vsldoi(vTmp8, vZero, vReducedLow, 1);     // 0x1
+    __ vor(vTmp8, vConstC2, vTmp8);              // 0xC2...1
+    __ vsplt(vTmp9, 0, vH);                      // MSB of H
+    __ vsl(vH, vH, vReducedLow);                 // Carry = H<<7
+    __ vsrab(vTmp9, vTmp9, vTmp10);
+    __ vand(vTmp9, vTmp9, vTmp8);                // Carry
+    __ vxor(vTmp10, vH, vTmp9);
+    __ vsldoi(vConstC2, vZero, vConstC2, 8);
+    __ vsldoi(vSwappedH, vTmp10, vTmp10, 8);     // Swap the lower and higher halves of subkey H
+    __ vsldoi(vLowerH, vZero, vSwappedH, 8);     // H.L
+    __ vsldoi(vHigherH, vSwappedH, vZero, 8);    // H.H
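+    // Note: the sequence above effectively doubles H in GF(2^128) (multiplies
+    // it by x): the most significant bit of H is broadcast (vsplt/vsrab),
+    // masked with the 0xC2...1 constant, and XORed into H << 1 to fold the
+    // shifted-out bit back in. The resulting "twisted" subkey is then kept in
+    // three layouts for the Karatsuba multiply: halves swapped (vSwappedH),
+    // lower half only (vLowerH = H.L), and higher half only (vHigherH = H.H).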
+#ifdef ASSERT
+    __ cmpwi(CR0, blocks, 0);                    // Compare 'blocks' (R6_ARG4) with zero
+    __ asm_assert_ne("blocks should NOT be zero");
+#endif
+    __ clrldi(blocks, blocks, 32);
+    __ mtctr(blocks);
+    __ lvsl(loadOrder, temp1);
+#ifdef VM_LITTLE_ENDIAN
+    __ vspltisb(vTmp12, 0xf);
+    __ vxor(loadOrder, loadOrder, vTmp12);
+#define LE_swap_bytes(x) __ vec_perm(x, x, x, loadOrder)
+#else
+#define LE_swap_bytes(x)
+#endif
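+    // The XOR with 0x0f above turns lvsl's identity permute control
+    // {0, 1, ..., 15} into the byte-reversal control {15, 14, ..., 0}, so on
+    // little-endian LE_swap_bytes swaps each loaded 16-byte block into the
+    // byte order the algorithm expects; on big-endian it is a no-op.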
+
+    // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation.
+    //
+    // The Karatsuba method breaks the multiplication of two 128-bit numbers into three
+    // 64x64-bit carry-less multiplications (each producing a 128-bit result) and
+    // combines the partial products efficiently:
+    //
+    //   (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
+    //   (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
+    //
+    // Inputs:
+    //   - vH:       the data vector (state), containing both B0 (lower half) and B1 (higher half)
+    //   - vLowerH:  lower half of the subkey H (A0)
+    //   - vHigherH: higher half of the subkey H (A1)
+    //   - vConstC2: constant used for the final reduction
+    //
+    // References:
+    //   Shay Gueron, Michael E. Kounavis.
+    //   "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
+    //   https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
+    //
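+    // As a worked sketch of the identity above (illustration only: clmul64()
+    // stands for a hypothetical 64x64 -> 128-bit carry-less multiply, the
+    // scalar analogue of vpmsumd, and u128 for a 128-bit value with 64-bit
+    // halves hi:lo):
+    //
+    //   u128 C = clmul64(A1, B1);            // high x high
+    //   u128 D = clmul64(A0, B0);            // low  x low
+    //   u128 E = clmul64(A0 ^ A1, B0 ^ B1);  // Karatsuba cross term
+    //   u128 M = C ^ D ^ E;                  // = A1*B0 ^ A0*B1
+    //
+    //   // The 256-bit product is (C << 128) ^ (M << 64) ^ D, whose 64-bit
+    //   // limbs are exactly C1 : (C0^C1^D1^E1) : (D1^C0^D0^E0) : D0.
+    //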
+    Label L_aligned_loop, L_store, L_unaligned_loop, L_initialize_unaligned_loop;
+    __ andi(temp1, data, 15);
+    __ cmpwi(CR0, temp1, 0);
+    __ bne(CR0, L_initialize_unaligned_loop);
+
+    __ bind(L_aligned_loop);
+    __ lvx(vH, temp1, data);
+    LE_swap_bytes(vH);
+    computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
+                      vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
+    __ addi(data, data, 16);
+    __ bdnz(L_aligned_loop);
+    __ b(L_store);
+
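+    // Unaligned case: lvx ignores the low four address bits, so each 16-byte
+    // block is assembled from the two aligned quadwords straddling it with a
+    // vperm whose control comes from lvsl on the unaligned address. Each
+    // iteration reuses the previously loaded quadword (vHigh), so only one
+    // new load is needed per block.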
+    __ bind(L_initialize_unaligned_loop);
+    __ li(temp1, 0);
+    __ lvsl(vPerm, temp1, data);
+    __ lvx(vHigh, temp1, data);
+#ifdef VM_LITTLE_ENDIAN
+    __ vspltisb(vTmp12, -1);
+    __ vxor(vPerm, vPerm, vTmp12);
+#endif
+    __ bind(L_unaligned_loop);
+    __ addi(data, data, 16);
+    __ lvx(vLow, temp1, data);
+    __ vec_perm(vH, vHigh, vLow, vPerm);
+    computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
+                      vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
+    __ vmr(vHigh, vLow);
+    __ bdnz(L_unaligned_loop);
+
+    __ bind(L_store);
+    __ stxvd2x(vState->to_vsr(), state);
+    __ blr();
+
+    return start;
+  }
   // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
   //
   // The code is implemented(ported from sparc) as we believe it benefits JVM98, however
@@ -5028,6 +5199,10 @@ void generate_lookup_secondary_supers_table_stub() {
       StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
     }
 
+    if (UseGHASHIntrinsics) {
+      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+    }
+
     if (UseAESIntrinsics) {
       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();