From 662d3d8d693e1d16fbe3575d7ac9e8225b7e61a6 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Thu, 18 Jul 2024 09:31:20 -0500
Subject: [PATCH 01/56] skeleton code

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 10 ++++++++++
 src/hotspot/cpu/ppc/vm_version_ppc.cpp    |  9 ++++++---
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index f32e62560721f..cb9ac1d77049a 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -629,7 +629,14 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }

+address generate_ghash_processBlocks() {
+  StubCodeMark mark(this, "StubRoutines", name);
+  address start = __ function_entry();
+  return start;
+
+
+}
 // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
 //
 // The code is implemented(ported from sparc) as we believe it benefits JVM98, however
@@ -4767,6 +4774,9 @@ address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
       StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
       StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
     }
+    if (UseGHASHIntrinsics) {
+      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+    }
     if (UseAESIntrinsics) {
       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();

diff --git a/src/hotspot/cpu/ppc/vm_version_ppc.cpp b/src/hotspot/cpu/ppc/vm_version_ppc.cpp
index 6aada789c1daf..770dffccfd332 100644
--- a/src/hotspot/cpu/ppc/vm_version_ppc.cpp
+++ b/src/hotspot/cpu/ppc/vm_version_ppc.cpp
@@ -305,10 +305,13 @@ void VM_Version::initialize() {
     FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
   }

-  if (UseGHASHIntrinsics) {
-    warning("GHASH intrinsics are not available on this CPU");
-    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+  if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+    FLAG_SET_DEFAULT(UseGHASHIntrinsics, true);
   }
+  // if (UseGHASHIntrinsics && !has_Crypto_GHASH()) {
+  //   warning("GHASH intrinsics are not available on this CPU");
+  //   FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+  // }

   if (FLAG_IS_DEFAULT(UseFMA)) {
     FLAG_SET_DEFAULT(UseFMA, true);

From 209f2e55cf606f3f254a6a98eb3779aad2543fbd Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Fri, 19 Jul 2024 09:40:11 -0500
Subject: [PATCH 02/56] add instructions for gcm shift

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 60 ++++++++++++++++++++++-
 1 file changed, 59 insertions(+), 1 deletion(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index cb9ac1d77049a..9f004ab3ed982 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -630,8 +630,66 @@ class StubGenerator: public StubCodeGenerator {
   return start;
 }
 address generate_ghash_processBlocks() {
-  StubCodeMark mark(this, "StubRoutines", name);
+  StubCodeMark mark(this, "StubRoutines", "ghash");
   address start = __ function_entry();
   Register data = R3_ARG1;     // byte[] data
   Register ofs = R4_ARG2;      // int ofs
   Register state = R5_ARG3;    // long[] st
   Register subkeyH = R6_ARG4;  // long[] subH
+
+  // Temporary registers
   Register temp1 = R8;
   Register temp2 = R9;
   Register temp3 = R10;
   Register temp4 = R11;
+
+VectorRegister vH = VR0;
   VectorRegister vX = VR1;
   VectorRegister vH_shift = VR2;
   VectorRegister vTmp1 = VR3;
   VectorRegister vTmp2 = VR4;
   VectorRegister vTmp3 = VR5;
   VectorRegister vTmp4 = VR6;
   VectorRegister vResult = VR7;
+
   VectorRegister vCarry = VR8;
   VectorRegister vPerm = VR9;

   // const unsigned char constC2[16] = {0xc2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

   // Load the address of the constant array into a register

   __ li(temp1, 0xc2);
   __ sldi(temp1, temp1, 56);
   // Load the vector from memory into vConstC2
   __ lvx(vX,temp1);
   __ vxor(vTmp1, vTmp1, vTmp1);

   // Load H into vector registers
   __ lxvd2x(32 + 1, 0, subkeyH); // Load H
   __ vspltisb(vTmp2, 1);
   __ vspltisb(vTmp3, 7);
   __ vsldoi(vTmp4, vTmp1, vTmp2, 1);
   __ vor(vPerm, vCarry, vTmp4);
   __ vsplt(vTmp1, 0, vH);
   __ vsl(vH_shift, vH, vTmp2);
   __ vsrab(vTmp1, vTmp1, vTmp3);
   __ vand(vTmp1, vTmp1, vPerm);
   __ vxor(vTmp2, vH_shift, vTmp1);

   __ vsldoi(vCarry, vTmp1, vPerm, 8);
   __ vsldoi(vTmp3, vTmp2, vTmp2, 8);
   __ vsldoi(vTmp1, vTmp1, vTmp3, 8);
   __ vsldoi(vTmp4, vTmp3, vTmp1, 8);

   // Store shifted H
   __ stvx(vCarry, 0, temp2);
   __ stvx(vTmp1, 16, temp2);
   __ stvx(vTmp3, 32, temp2);
   __ stvx(vTmp4, 48, temp2);
   // if(UseNewCode){
   //   __ unimplemented("ghash");
   // }

   return start;

From ea99cc82c0056aa233d4958ff8c8158d02174993 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Mon, 22 Jul 2024 11:03:14 -0500
Subject: [PATCH 03/56] further code

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 72 +++++++++++++++++++++--
 1 file changed, 67 insertions(+), 5 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 9f004ab3ed982..e2488b7ee3b03 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -633,9 +633,13 @@ address generate_ghash_processBlocks() {
   StubCodeMark mark(this, "StubRoutines", "ghash");
   address start = __ function_entry();
   Register data = R3_ARG1;     // byte[] data
-  Register ofs = R4_ARG2;      // int ofs
-  Register state = R5_ARG3;    // long[] st
-  Register subkeyH = R6_ARG4;  // long[] subH
+  Register ofs = R4_ARG2;      // int ofs
+  Register blocks = R5_ARG3;   // int blocks
+  Register state = R6_ARG4;    // long[] st
+  Register subkeyH = R7_ARG5;  // long[] subH
+
+
+
   // Temporary registers
   Register temp1 = R8;
   Register temp2 = R9;
   Register temp3 = R10;
   Register temp4 = R11;
@@ -658,11 +662,11 @@ VectorRegister vH = VR0;

   // Load the address of the constant array into a register
-
+  VectorRegister vConstC2 = VR10;
   __ li(temp1, 0xc2);
   __ sldi(temp1, temp1, 56);
   // Load the vector from memory into vConstC2
-  __ lvx(vX,temp1);
+  __ mtvrd(vConstC2, temp1);
   __ vxor(vTmp1, vTmp1, vTmp1);

   // Load H into vector registers
@@ -687,6 +691,64 @@ VectorRegister vH = VR0;
   __ stvx(vTmp1, 16, temp2);
   __ stvx(vTmp3, 32, temp2);
   __ stvx(vTmp4, 48, temp2);
+
+  // VectorSRegister vCarryS = VSR0; // Create a scalar vector register for mtvsrd
+
+  __ li(temp1, 0xc2);
+  __ sldi(temp1, temp1, 56);
+  __ mtvrd(vConstC2, temp1); // Use VectorSRegister for mtvsrd
+  __ vxor(vTmp1, vTmp1, vTmp1);
+
+  // Load H into vector registers
+  __ li(temp1, 16);
+  __ lxvd2x(32 + 1, temp1, subkeyH); // Load Hl
+  __ li(temp1, 32);
+  __ lxvd2x(2 + 32, temp1, subkeyH); // Load H
+  __ li(temp1, 48);
+  __ lxvd2x(11 + 32, temp1, subkeyH); // Load Hh
+
+  __ vxor(vH, vH, vH);
+
+  // Calculate the number of blocks
+  __ li(temp1, 16);
+  __ divdu(temp2, blocks, temp1);
+  __ mtctr(temp2);
+  __ li(temp3, 0);
+
+  Label loop;
+  __ bind(loop);
+
+  // Load input data
+  // __ lxvb16x(32 + 1, temp3, data);
+  __ lvx(vX,temp1);
+  __ addi(temp3, temp3, 16);
+
+  // Perform GCM multiplication
+  __ vpmsumd(vTmp1, vH_shift, vX); // L
+  __ vpmsumd(vTmp2, vH, vX);       // M
+  __ vpmsumd(vTmp3, vH_shift, vX); // H
+
+  __ vpmsumd(vTmp4, vTmp1, vConstC2); // reduction
+
+  __ vsldoi(vTmp1, vTmp2, vH, 8);  // mL
+  __ vsldoi(vTmp2, vH, vTmp2, 8);  // mH
+  __ vxor(vTmp1, vTmp1, vTmp1);    // LL + LL
+  __ vxor(vTmp3, vTmp3, vTmp2);    // HH + HH
+
+  __ vsldoi(vTmp1, vTmp1, vTmp1, 8);  // swap
+  __ vxor(vTmp1, vTmp1, vTmp4);       // reduction
+
+  __ vsldoi(vTmp4, vTmp1, vTmp1, 8);  // swap
+  __ vpmsumd(vTmp1, vTmp1, vConstC2); // reduction
+  __ vxor(vTmp4, vTmp4, vTmp3);
+  __ vxor(vH, vTmp1, vTmp4);
+
+  __ bdnz(loop);
+  // __ stxv(vH, state, temp4);
+  __ blr(); // Return from function

   // if(UseNewCode){
   //   __ unimplemented("ghash");
   // }

   return start;

From ea4d59afd1d6c691d36e7fdf26d120dc3e619a99 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Tue, 3 Sep 2024 07:20:43 -0500
Subject: [PATCH 04/56] ghash

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 84 ++++++++++++-----------
 1 file changed, 45 insertions(+), 39 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index e2488b7ee3b03..2a8bd495cf4ac 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -629,6 +629,7 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
+int fubar=0;
 address generate_ghash_processBlocks() {
   StubCodeMark mark(this, "StubRoutines", "ghash");
   address start = __ function_entry();
@@ -646,8 +647,10 @@ address generate_ghash_processBlocks() {
   Register temp2 = R9;
   Register temp3 = R10;
   Register temp4 = R11;
+  Register fubar_addr = R12;
+  Register fubar_value = R13;

-VectorRegister vH = VR0;
+  VectorRegister vH = VR0;
   VectorRegister vX = VR1;
   VectorRegister vH_shift = VR2;
   VectorRegister vTmp1 = VR3;
@@ -655,58 +658,63 @@ VectorRegister vH = VR0;
   VectorRegister vTmp3 = VR5;
   VectorRegister vTmp4 = VR6;
   VectorRegister vResult = VR7;
-  VectorRegister vCarry = VR8;
-  VectorRegister vPerm = VR9;
+  VectorRegister vMSB = VR8;
+  VectorRegister vLowerH = VR9;
+  VectorRegister vHigherH = VR10;
+  VectorRegister vZero = VR11;
+  VectorRegister vConst1 = VR12;
+  VectorRegister vConst7 = VR13;

-  // const unsigned char constC2[16] = {0xc2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

-  // Load the address of the constant array into a register
-  VectorRegister vConstC2 = VR10;
+  VectorRegister vConstC2 = VR10;
+  __ li(temp1, 0x2);
+  __ li(temp1, 0x3);
+  __ li(temp1, 0xc4);
+  __ li(temp1, 0x5);
   __ li(temp1, 0xc2);
   __ sldi(temp1, temp1, 56);
+
   // Load the vector from memory into vConstC2
   __ mtvrd(vConstC2, temp1);
-  __ vxor(vTmp1, vTmp1, vTmp1);
+  __ vxor(vZero, vZero, vZero);

   // Load H into vector registers
   // Use a different register (e.g., R3)
+
   __ li(temp1, 0);                // Load immediate value 0 into temp
   __ lvx(vH, temp1, subkeyH);     // Load H using temp instead of R0

-  __ vspltisb(vTmp2, 1);
-  __ vspltisb(vTmp3, 7);
-  __ vsldoi(vTmp4, vTmp1, vTmp2, 1);
-  __ vor(vPerm, vCarry, vTmp4);
-  __ vsplt(vTmp1, 0, vH);
-  __ vsl(vH_shift, vH, vTmp2);
-  __ vsrab(vTmp1, vTmp1, vTmp3);
-  __ vand(vTmp1, vTmp1, vPerm);
-  __ vxor(vTmp2, vH_shift, vTmp1);
+  __ vspltisb(vConst1, 1);
+  __ vsldoi(vTmp4, vZero, vConst1, 1);
+  __ vor(vTmp4, vConstC2, vTmp4);
+  __ vsplt(vMSB, 0, vH);
+  __ vsl(vH_shift, vH, vConst7);
+  __ vsrab(vMSB, vMSB, vConst7);
+  __ vand(vMSB, vMSB, vTmp4);
+  __ vxor(vTmp2, vH_shift, vMSB);
+
   __ vsldoi(vTmp3, vTmp2, vTmp2, 8);
-  __ vsldoi(vCarry, vTmp1, vPerm, 8);
-  __ vsldoi(vTmp1, vTmp1, vTmp3, 8);
-  __ vsldoi(vTmp4, vTmp3, vTmp1, 8);
+  __ vsldoi(vLowerH, vZero, vTmp3, 8);
+  __ vsldoi(vHigherH, vTmp3, vZero, 8);
+
+  __ load_const_optimized(fubar_addr, (uintptr_t)&fubar);
+  __ ld(fubar_value, 0, fubar_addr);
+  __ addi(fubar_value, fubar_value, 1);
+  __ std(fubar_value, 0, fubar_addr);
+  __ unimplemented("ghash");
+
+/*
+  // Store shifted
   // VectorSRegister vCarryS = VSR0; // Create a scalar vector register for mtvsrd

   __ li(temp1, 0xc2);
   __ sldi(temp1, temp1, 56);
-  __ mtvrd(vConstC2, temp1); // Use VectorSRegister for mtvsrd
+  __ mtvrd(vConstC2, temp1);      // Use VectorSRegister for mtvsrd
   __ vxor(vTmp1, vTmp1, vTmp1);

-  __ vxor(vH, vH, vH);

   // Calculate the number of blocks
@@ -720,7 +728,7 @@ VectorRegister vH = VR0;

   // Load input data
   // __ lxvb16x(32 + 1, temp3, data);
-  __ lvx(vX,temp1);
+  __ lvx(vX,temp1);
   __ addi(temp3, temp3, 16);

   // Perform GCM multiplication
@@ -746,13 +754,11 @@ VectorRegister vH = VR0;
   __ bdnz(loop);
   // __ stxv(vH, state, temp4);
   __ blr(); // Return from function
+
+*/

-  // if(UseNewCode){
-  //   __ unimplemented("ghash");
-  // }

   return start;

From a37cb6fea3f339c30dde590b195922acd807f3e2 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Tue, 8 Oct 2024 09:01:58 -0500
Subject: [PATCH 05/56] change byte order

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 141 +++++++++++-----------
 1 file changed, 68 insertions(+), 73 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 2a8bd495cf4ac..80ddc8140f15a 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -69,6 +69,7 @@
 #define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name)->entry()
 #endif

+ int fubar=0;
 class StubGenerator: public StubCodeGenerator {
   private:
@@ -629,84 +630,80 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
-int fubar=0;
 address generate_ghash_processBlocks() {
   StubCodeMark mark(this, "StubRoutines", "ghash");
   address start = __ function_entry();
-  Register data = R3_ARG1;     // byte[] data
-  Register ofs = R4_ARG2;      // int ofs
-  Register blocks = R5_ARG3;   // int blocks
-  Register state = R6_ARG4;    // long[] st
-  Register subkeyH = R7_ARG5;  // long[] subH
-
-
-
-
+
+  // Register ofs = R4_ARG2; // int ofs
+  // int blocks
   Register state = R3_ARG1;    // long[] st
+  Register subkeyH = R4_ARG2;
+  Register data = R5_ARG3;     // byte[] data // long[] subH
+  Register blocks = R6_ARG4;
+  __ stop("ghash start");
+
   // Temporary registers
   Register temp1 = R8;
   Register temp2 = R9;
   Register temp3 = R10;
   Register temp4 = R11;
   Register fubar_addr = R12;
   Register fubar_value = R13;
   VectorRegister vH = VR0;
   VectorRegister vX = VR1;
   VectorRegister vH_shift = VR2;
   VectorRegister vTmp1 = VR3;
   VectorRegister vTmp2 = VR4;
   VectorRegister vTmp3 = VR5;
   VectorRegister vTmp4 = VR6;
   VectorRegister vResult = VR7;
   VectorRegister vMSB = VR8;
   VectorRegister vLowerH = VR9;
   VectorRegister vHigherH = VR10;
   VectorRegister vZero = VR11;
   VectorRegister vConst1 = VR12;
   VectorRegister vConst7 = VR13;
   VectorRegister vConstC2 = VR10;
   VectorRegister fromPerm = VR15;
+
   static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
+
+// Load the address of perm_pattern
+__ load_const_optimized(temp1, (uintptr_t)&perm_pattern);
+
+// Load the 128-bit vector from memory
+__ vxor(fromPerm, fromPerm, fromPerm); // Clear the vector register
+__ lvx(fromPerm, temp1); // Lo
   __ li(temp1, 0xc2);
   __ sldi(temp1, temp1, 56);
   // Load the vector from memory into vConstC2
+  __ vxor(vConstC2,vConstC2,vConstC2);
   __ mtvrd(vConstC2, temp1);
   __ vxor(vZero, vZero, vZero);
   // Load H into vector registers
   // Use a different register (e.g., R3)
   __ li(temp1, 0);            // Load immediate value 0 into temp
+  __ vxor(vH,vH,vH);
   __ lvx(vH, temp1, subkeyH); // Load H using temp instead of R0
+  __ vec_perm(vH, vH, vH, fromPerm);
   __ vspltisb(vConst1, 1);    // Vector with 1s
   __ vspltisb(vConst7, 7);    // Vector with 7s

   __ vsldoi(vTmp4, vZero, vConst1, 1);  // 0x1
   __ vor(vTmp4, vConstC2, vTmp4);       //0xC2...1
   __ vsplt(vMSB, 0, vH);                // MSB of H
   __ vxor(vH_shift, vH_shift,vH_shift);
   __ vsl(vH_shift, vH, vConst7);        // Carry= H<<7
   __ vsrab(vMSB, vMSB, vConst7);
   __ vand(vMSB, vMSB, vTmp4);           //Carry
   __ vxor(vTmp2, vH_shift, vMSB);       // shift H<<<1

   __ vsldoi(vTmp3, vTmp2, vTmp2, 8);    // swap L,H
   __ vsldoi(vLowerH, vZero, vTmp3, 8);  //H.L
   __ vsldoi(vHigherH, vTmp3, vZero, 8); //H.H

-  // Store shifted
-  // VectorSRegister vCarryS = VSR0; // Create a scalar vector register for mtvsrd
-
-  __ li(temp1, 0xc2);
-  __ sldi(temp1, temp1, 56);
-  __ mtvrd(vConstC2, temp1); // Use VectorSRegister for mtvsrd
   __ vxor(vTmp1, vTmp1, vTmp1);
-
-  __ vxor(vH, vH, vH);

   // Calculate the number of blocks
-  __ li(temp1, 16);
-  __ divdu(temp2, blocks, temp1);
-  __ mtctr(temp2);
-  __ li(temp3, 0);
+  __ mtctr(blocks);
+  __ li(temp1, 0);

   Label loop;
   __ bind(loop);
-
-  // Load input data
-  // __ lxvb16x(32 + 1, temp3, data);
-  __ lvx(vX,temp1);
-  __ addi(temp3, temp3, 16);
+
+  // Load immediate value 0 into temp
+  __ vxor(vX,vX,vX);
+  __ lvx(vX, temp1, data);
+  // __ vec_perm(vX, vX, vX, fromPerm);
+  __ addi(temp1, temp1, 16);

   // Perform GCM multiplication

From cbf988356c3e79294e3ea637ed377f7dead5fffd Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Thu, 17 Oct 2024 02:07:49 -0500
Subject: [PATCH 06/56] reduction phase

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 67 +++++++++++------------
 1 file changed, 31 insertions(+), 36 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 80ddc8140f15a..0be2d38945b1f 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -654,7 +654,7 @@ address generate_ghash_processBlocks() {
   VectorRegister vH_shift = VR2;
   VectorRegister vTmp1 = VR3;
   VectorRegister vTmp2 = VR4;
-  VectorRegister vTmp3 = VR5;
+  VectorRegister vSwappedH = VR5;
   VectorRegister vTmp4 = VR6;
   VectorRegister vResult = VR7;
   VectorRegister vMSB = VR8;
@@ -665,6 +665,11 @@ address generate_ghash_processBlocks() {
   VectorRegister vConst7 = VR13;
   VectorRegister vConstC2 = VR10;
   VectorRegister fromPerm = VR15;
+  VectorRegister vTmp3 = VR16;
+  VectorRegister vTmp5 = VR17;
+  VectorRegister vTmp6 = VR18;
+  VectorRegister vTmp7 = VR19;
+

   static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
@@ -693,62 +698,52 @@ __ lvx(fromPerm, temp1); // Lo
   __ vor(vTmp4, vConstC2, vTmp4);       //0xC2...1
   __ vsplt(vMSB, 0, vH);                // MSB of H
   __ vxor(vH_shift, vH_shift,vH_shift);
-  __ vsl(vH_shift, vH, vConst7);        // Carry= H<<7
+  __ vsl(vH_shift, vH, vConst1);        // Carry= H<<7
   __ vsrab(vMSB, vMSB, vConst7);
   __ vand(vMSB, vMSB, vTmp4);           //Carry
   __ vxor(vTmp2, vH_shift, vMSB);       // shift H<<<1

-  __ vsldoi(vTmp3, vTmp2, vTmp2, 8);    // swap L,H
-  __ vsldoi(vLowerH, vZero, vTmp3, 8);  //H.L
-  __ vsldoi(vHigherH, vTmp3, vZero, 8); //H.H
-
-
+  __ vsldoi(vSwappedH, vTmp2, vTmp2, 8);    // swap L,H
+  __ vsldoi(vLowerH, vZero, vSwappedH, 8);  //H.L
+  __ vsldoi(vHigherH, vSwappedH, vZero, 8); //H.H

   __ vxor(vTmp1, vTmp1, vTmp1);
+  __ vxor(vZero, vZero, vZero);

   // Calculate the number of blocks
   __ mtctr(blocks);
   __ li(temp1, 0);

   Label loop;
   __ bind(loop);
-
+
   // Load immediate value 0 into temp
   __ vxor(vX,vX,vX);
   __ lvx(vX, temp1, data);
   // __ vec_perm(vX, vX, vX, fromPerm);
   __ addi(temp1, temp1, 16);

   // Perform GCM multiplication
-  __ vpmsumd(vTmp1, vH_shift, vX); // L
-  __ vpmsumd(vTmp2, vH, vX);       // M
-  __ vpmsumd(vTmp3, vH_shift, vX); // H
+  __ vpmsumd(vTmp1, vLowerH, vX);   // L
+  __ vpmsumd(vTmp2, vSwappedH, vX); // M
+  __ vpmsumd(vTmp3, vHigherH, vX);  // H
   __ vpmsumd(vTmp4, vTmp1, vConstC2); // reduction

-  __ vsldoi(vTmp1, vTmp2, vH, 8);  // mL
-  __ vsldoi(vTmp2, vH, vTmp2, 8);  // mH
-  __ vxor(vTmp1, vTmp1, vTmp1);    // LL + LL
-  __ vxor(vTmp3, vTmp3, vTmp2);    // HH + HH
+  __ vsldoi(vTmp5, vTmp2, vZero, 8); // mL
+  __ vsldoi(vTmp6, vZero, vTmp2, 8); // mH
+
+  __ vxor(vTmp1, vTmp1, vTmp5);      // LL + LL
+  __ vxor(vTmp3, vTmp3, vTmp6);      // HH + HH

   __ vsldoi(vTmp1, vTmp1, vTmp1, 8); // swap
   __ vxor(vTmp1, vTmp1, vTmp4);      // reduction

-  __ vsldoi(vTmp4, vTmp1, vTmp1, 8); // swap
+  __ vsldoi(vTmp7, vTmp1, vTmp1, 8); // swap
   __ vpmsumd(vTmp1, vTmp1, vConstC2); // reduction
-  __ vxor(vTmp4, vTmp4, vTmp3);
-  __ vxor(vH, vTmp1, vTmp4);
-
+  __ vxor(vTmp7, vTmp7, vTmp3);
+  __ vxor(vZero, vTmp1, vTmp7);
   __ bdnz(loop);

   // __ stxv(vH, state, temp4);
   __ blr(); // Return from function

From 5e5a9bdd42af0f0b6b76eadd484969c436c883c4 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Thu, 17 Oct 2024 03:54:52 -0500
Subject: [PATCH 07/56] reduction phase

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 0be2d38945b1f..98ad7b0d3c21c 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -663,7 +663,7 @@ address generate_ghash_processBlocks() {
   VectorRegister vZero = VR11;
   VectorRegister vConst1 = VR12;
   VectorRegister vConst7 = VR13;
-  VectorRegister vConstC2 = VR10;
+  VectorRegister vConstC2 = VR14;
   VectorRegister fromPerm = VR15;
   VectorRegister vTmp3 = VR16;
   VectorRegister vTmp5 = VR17;
@@ -672,15 +672,16 @@ address generate_ghash_processBlocks() {

   static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
-
+static const unsigned char perm_pattern2[16] __attribute__((aligned(16))) = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
 // Load the address of perm_pattern
 __ load_const_optimized(temp1, (uintptr_t)&perm_pattern);
+__ load_const_optimized(temp2, (uintptr_t)&perm_pattern2);

 // Load the 128-bit vector from memory
 __ vxor(fromPerm, fromPerm, fromPerm); // Clear the vector register
 __ lvx(fromPerm, temp1); // Lo
-__ li(temp1, 0xc2);
-  __ sldi(temp1, temp1, 56);
+__ li(temp1, 0xc2);
+  __ sldi(temp1, temp1, 56);
   // Load the vector from memory into vConstC2
   __ vxor(vConstC2,vConstC2,vConstC2);
   __ mtvrd(vConstC2, temp1);
@@ -702,7 +703,8 @@ __ lvx(fromPerm, temp1); // Lo
   __ vsrab(vMSB, vMSB, vConst7);
   __ vand(vMSB, vMSB, vTmp4);      //Carry
   __ vxor(vTmp2, vH_shift, vMSB);  // shift H<<<1
-
+  // vsldoi 19, 0, 18, 8
+  __ vsldoi(vConstC2, vZero, vConstC2, 8);
   __ vsldoi(vSwappedH, vTmp2, vTmp2, 8);    // swap L,H
   __ vsldoi(vLowerH, vZero, vSwappedH, 8);  //H.L
   __ vsldoi(vHigherH, vSwappedH, vZero, 8); //H.H
@@ -711,7 +713,7 @@ __ lvx(fromPerm, temp1); // Lo
   __ vxor(vZero, vZero, vZero);

   // Calculate the number of blocks
-
+  __ lvx(fromPerm, temp2);
   __ mtctr(blocks);
   __ li(temp1, 0);
   Label loop;
@@ -720,6 +722,7 @@ __ lvx(fromPerm, temp1); // Lo
   // Load immediate value 0 into temp
   __ vxor(vX,vX,vX);
   __ lvx(vX, temp1, data);
+  __ vec_perm(vX, vX, vX, fromPerm);
   // __ vec_perm(vX, vX, vX, fromPerm);
   __ addi(temp1, temp1, 16);

From 775e719280dbff41d3d203e2c8f07dda12e4ea2c Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Mon, 21 Oct 2024 04:49:57 -0500
Subject: [PATCH 08/56] reduction phase

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 98ad7b0d3c21c..fa20e9ea0366f 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -746,8 +746,8 @@ __ li(temp1, 0xc2);
   __ vxor(vTmp7, vTmp7, vTmp3);
   __ vxor(vZero, vTmp1, vTmp7);
   __ bdnz(loop);
-
-  // __ stxv(vH, state, temp4);
+  __ li(temp4, 0);
+  __ stvx(vZero, temp4, state);
   __ blr(); // Return from function

From 5ff72222eb10d29244f08ce7b9afdc77494e4772 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Fri, 25 Oct 2024 04:40:03 -0500
Subject: [PATCH 09/56] alignment issues

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 61 +++++++++++++++++++----
 1 file changed, 50 insertions(+), 11 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index fa20e9ea0366f..bba4ad7798c08 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -640,13 +640,14 @@ address generate_ghash_processBlocks() {
   Register subkeyH = R4_ARG2;
   Register data = R5_ARG3;   // byte[] data // long[] subH
   Register blocks = R6_ARG4;
-  __ stop("ghash start");
+  // __ stop("ghash start");

   // Temporary registers
   Register temp1 = R8;
   Register temp2 = R9;
   Register temp3 = R10;
   Register temp4 = R11;
+  Register align = data;
   Register fubar_addr = R12;
   Register fubar_value = R13;
   VectorRegister vH = VR0;
@@ -669,17 +670,23 @@ address generate_ghash_processBlocks() {
   VectorRegister vTmp5 = VR17;
   VectorRegister vTmp6 = VR18;
   VectorRegister vTmp7 = VR19;
+  VectorRegister vHigh = VR20;
+  VectorRegister vLow = VR21;
+  VectorRegister vPerm = VR22;
+  Label L_end, L_aligned, L_align_2,L_end_2;

-  static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
+
+
+static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
 static const unsigned char perm_pattern2[16] __attribute__((aligned(16))) = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};

 // Load the address of perm_pattern
 __ load_const_optimized(temp1, (uintptr_t)&perm_pattern);
 __ load_const_optimized(temp2, (uintptr_t)&perm_pattern2);
-
+__ li(temp3,0);
 // Load the 128-bit vector from memory
 __ vxor(fromPerm, fromPerm, fromPerm); // Clear the vector register
-__ lvx(fromPerm, temp1); // Lo
+__ lvxl(fromPerm, temp3, temp1); // Lo
 __ li(temp1, 0xc2);
   __ sldi(temp1, temp1, 56);
   // Load the vector from memory into vConstC2
@@ -688,10 +695,27 @@ __ li(temp1, 0xc2);
   __ vxor(vZero, vZero, vZero);
   // Load H into vector registers
   // Use a different register (e.g., R3)
+  __ li(temp1, 0);
+  __ andi(temp1, subkeyH, 15);
+  __ cmpwi(CCR0,temp1,0);
+  __ beq(CCR0, L_aligned); // Check if 'to' is aligned (mask lower 4 bits)
   __ li(temp1, 0);         // Load immediate value 0 into temp
   __ vxor(vH,vH,vH);
+
+  __ lvx(vHigh, temp1, subkeyH); // Load H using temp instead of R0
+  __ lvsl(vPerm,temp1,subkeyH);
+  __ addi(subkeyH,subkeyH,16);
+  __ lvx(vLow,temp1,subkeyH);
+  __ vec_perm(vH, vHigh, vLow, vPerm);
+  __ subi(subkeyH,subkeyH,16);
+
+  __ b(L_end);
+  __ bind(L_aligned);
+  __ lvx(vH,temp1,subkeyH);
+  __ bind(L_end);
   __ vec_perm(vH, vH, vH, fromPerm);
+
   __ vspltisb(vConst1, 1); // Vector with 1s
   __ vspltisb(vConst7, 7); // Vector with 7s
@@ -713,17 +737,34 @@ __ lvx(fromPerm, temp1); // Lo
   __ vxor(vZero, vZero, vZero);

   // Calculate the number of blocks
-  __ lvx(fromPerm, temp2);
+
   __ mtctr(blocks);
   __ li(temp1, 0);
+  __ load_const_optimized(temp2, (uintptr_t)&perm_pattern2);
   Label loop;
   __ bind(loop);

   // Load immediate value 0 into temp
   __ vxor(vX,vX,vX);
-  __ lvx(vX, temp1, data);
+  __ li(temp1,0);
+
+  __ andi(temp1, data, 15);
+  __ cmpwi(CCR0,temp1,0);
+  __ beq(CCR0, L_align_2);
+  __ li(temp1,0);
+  __ lvx(vHigh,temp1,align);
+  __ lvsl(fromPerm,temp1,align);
+  __ addi(align,align,16);
+  __ lvx(vLow,temp1,data);
+  __ vec_perm(vX, vHigh, vLow, fromPerm);
+  __ subi(align,align,16);
+  __ b(L_end_2);
+  __ bind(L_align_2);
+  __ lvx(vX,temp1,data);
+
+  __ bind(L_end_2);
+  __ lvx(fromPerm, temp2);
   __ vec_perm(vX, vX, vX, fromPerm);
   __ addi(temp1, temp1, 16);

   // Perform GCM multiplication
@@ -749,10 +790,8 @@ __ li(temp1, 0xc2);
   __ li(temp4, 0);
   __ stvx(vZero, temp4, state);
   __ blr(); // Return from function
-
-
-
   return start;
+
 }

From fb5c9466b66727ab65fd78ab1ec993be76524a68 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Mon, 11 Nov 2024 07:10:47 -0600
Subject: [PATCH 10/56] alignment for load and store

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 121 +++++++++++++---------
 1 file changed, 72 insertions(+), 49 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index bba4ad7798c08..36518942cf6d6 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -69,7 +69,6 @@
 #define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name)->entry()
 #endif

- int fubar=0;
 class StubGenerator: public StubCodeGenerator {
   private:
@@ -633,23 +632,17 @@ class StubGenerator: public StubCodeGenerator {
 address generate_ghash_processBlocks() {
   StubCodeMark mark(this, "StubRoutines", "ghash");
   address start = __ function_entry();
-
-  // Register ofs = R4_ARG2; // int ofs
-  // int blocks
   Register state = R3_ARG1;   // long[] st
-  Register subkeyH = R4_ARG2;
-  Register data = R5_ARG3;    // byte[] data // long[] subH
+  Register subkeyH = R4_ARG2; // long[] subH
+  Register data = R5_ARG3;    // byte[] data
   Register blocks = R6_ARG4;
   // __ stop("ghash start");
   // Temporary registers
   Register temp1 = R8;
   Register temp2 = R9;
   Register temp3 = R10;
   Register temp4 = R11;
   Register align = data;
-  Register fubar_addr = R12;
-  Register fubar_value = R13;
   VectorRegister vH = VR0;
   VectorRegister vX = VR1;
   VectorRegister vH_shift = VR2;
@@ -673,52 +666,62 @@ address generate_ghash_processBlocks() {
   VectorRegister vHigh = VR20;
   VectorRegister vLow = VR21;
   VectorRegister vPerm = VR22;
-
-  Label L_end, L_aligned, L_align_2,L_end_2;
-
-
-
-static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
-static const unsigned char perm_pattern2[16] __attribute__((aligned(16))) = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+  VectorRegister vZero_Stored = VR23;
+  VectorRegister vMask = VR24;
+  VectorRegister vS = VR25;
+  Label L_end, L_aligned, L_align_2,L_end_2,L_aligned3,L_end3,L_aligned4,L_end4;
+  static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
+  static const unsigned char perm_pattern2[16] __attribute__((aligned(16))) = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
 // Load the address of perm_pattern
-__ load_const_optimized(temp1, (uintptr_t)&perm_pattern);
-__ load_const_optimized(temp2, (uintptr_t)&perm_pattern2);
-__ li(temp3,0);
-// Load the 128-bit vector from memory
-__ vxor(fromPerm, fromPerm, fromPerm); // Clear the vector register
-__ lvxl(fromPerm, temp3, temp1); // Lo
-__ li(temp1, 0xc2);
-  __ sldi(temp1, temp1, 56);
+  __ load_const_optimized(temp1, (uintptr_t)&perm_pattern);
+  __ load_const_optimized(temp2, (uintptr_t)&perm_pattern2);
+  __ li(temp3,0);
+  __ vxor(fromPerm, fromPerm, fromPerm); // Clear the vector register
+  __ lvxl(fromPerm, temp3, temp1); // Lo
+  __ li(temp1, 0xc2);
+  __ sldi(temp1, temp1, 56);
   // Load the vector from memory into vConstC2
   __ vxor(vConstC2,vConstC2,vConstC2);
   __ mtvrd(vConstC2, temp1);
   __ vxor(vZero, vZero, vZero);
   // Load H into vector registers
   // Use a different register (e.g., R3)
   __ li(temp1, 0);
   __ andi(temp1, subkeyH, 15);
   __ cmpwi(CCR0,temp1,0);
   __ beq(CCR0, L_aligned); // Check if 'to' is aligned (mask lower 4 bits)
   __ li(temp1, 0);         // Load immediate value 0 into temp
   __ vxor(vH,vH,vH);
   __ lvx(vHigh, temp1, subkeyH); // Load H using temp instead of R0
   __ lvsl(vPerm,temp1,subkeyH);
   __ addi(subkeyH,subkeyH,16);
   __ lvx(vLow,temp1,subkeyH);
   __ vec_perm(vH, vHigh, vLow, vPerm);
   __ subi(subkeyH,subkeyH,16);
   __ b(L_end);
   __ bind(L_aligned);
   __ lvx(vH,temp1,subkeyH);
   __ bind(L_end);
   __ vec_perm(vH, vH, vH, fromPerm);
-
-  __ vspltisb(vConst1, 1); // Vector with 1s
-  __ vspltisb(vConst7, 7); // Vector with 7s
-
+  __ li(temp1, 0);
+  __ andi(temp1, state, 15);
+  __ cmpwi(CCR0,temp1,0);
+  __ beq(CCR0, L_aligned3);// Check if 'to' is aligned (mask lower 4 bits)
+  __ li(temp1, 0);         // Load immediate value 0 into temp
+  __ vxor(vZero_Stored,vZero_Stored,vZero_Stored);
+  __ lvx(vHigh, temp1, state); // Load H using temp instead of R0
+  __ lvsl(vPerm,temp1,state);
+  __ addi(state, state, 16);
+  __ lvx(vLow, temp1, state);
+  __ vec_perm(vZero_Stored, vHigh, vLow, vPerm);
+  __ subi(state,state,16);
+  __ b(L_end3);
+  __ bind(L_aligned3);
+  __ lvx(vZero_Stored,temp1,state);
+  __ bind(L_end3);
+  __ vec_perm(vZero_Stored, vZero_Stored, vZero_Stored, fromPerm);
+  __ vspltisb(vConst1, 1);
+  __ vspltisb(vConst7, 7);
   __ vsldoi(vTmp4, vZero, vConst1, 1); // 0x1
   __ vor(vTmp4, vConstC2, vTmp4);      //0xC2...1
   __ vsplt(vMSB, 0, vH);               // MSB of H
@@ -727,30 +730,29 @@ __ li(temp1, 0xc2);
   __ vsrab(vMSB, vMSB, vConst7);
   __ vand(vMSB, vMSB, vTmp4);      //Carry
   __ vxor(vTmp2, vH_shift, vMSB);  // shift H<<<1
-  // vsldoi 19, 0, 18, 8
   __ vsldoi(vConstC2, vZero, vConstC2, 8);
   __ vsldoi(vSwappedH, vTmp2, vTmp2, 8);    // swap L,H
   __ vsldoi(vLowerH, vZero, vSwappedH, 8);  //H.L
   __ vsldoi(vHigherH, vSwappedH, vZero, 8); //H.H
-
-  __ vxor(vTmp1, vTmp1, vTmp1);
-  __ vxor(vZero, vZero, vZero);
-
+  __ vxor(vTmp1, vTmp1, vTmp1);
+  __ vxor(vZero, vZero, vZero);
   // Calculate the number of blocks
-  __ mtctr(blocks);
-  __ li(temp1, 0);
-  __ load_const_optimized(temp2, (uintptr_t)&perm_pattern2);
+  __ mtctr(blocks);
+  __ li(temp1, 0);
+  __ load_const_optimized(temp2, (uintptr_t)&perm_pattern2);
+
   Label loop;
   __ bind(loop);

   // Load immediate value 0 into temp
   __ vxor(vX,vX,vX);
+  __ vxor(vZero, vZero, vZero);
   __ li(temp1,0);
-
+  //alignment
   __ andi(temp1, data, 15);
   __ cmpwi(CCR0,temp1,0);
-  __ beq(CCR0, L_align_2);
+  __ beq(CCR0, L_align_2);
   __ li(temp1,0);
   __ lvx(vHigh,temp1,align);
   __ lvsl(fromPerm,temp1,align);
   __ addi(align,align,16);
   __ lvx(vLow,temp1,data);
   __ vec_perm(vX, vHigh, vLow, fromPerm);
   __ subi(align,align,16);
   __ b(L_end_2);
   __ bind(L_align_2);
   __ lvx(vX,temp1,data);
-
-  __ bind(L_end_2);
+  __ bind(L_end_2);
   __ lvx(fromPerm, temp2);
   __ vec_perm(vX, vX, vX, fromPerm);
-  __ addi(temp1, temp1, 16);
+
+  __ vxor(vX,vX,vZero_Stored);

   // Perform GCM multiplication
   __ vpmsumd(vTmp1, vLowerH, vX);   // L
   __ vpmsumd(vTmp2, vSwappedH, vX); // M
   __ vpmsumd(vTmp3, vHigherH, vX);  // H
   __ vpmsumd(vTmp4, vTmp1, vConstC2); // reduction
-
   __ vsldoi(vTmp5, vTmp2, vZero, 8); // mL
   __ vsldoi(vTmp6, vZero, vTmp2, 8); // mH
-
   __ vxor(vTmp1, vTmp1, vTmp5);      // LL + LL
   __ vxor(vTmp3, vTmp3, vTmp6);      // HH + HH
-
   __ vsldoi(vTmp1, vTmp1, vTmp1, 8); // swap
   __ vxor(vTmp1, vTmp1, vTmp4);      // reduction
-
   __ vsldoi(vTmp7, vTmp1, vTmp1, 8); // swap
   __ vpmsumd(vTmp1, vTmp1, vConstC2); // reduction
   __ vxor(vTmp7, vTmp7, vTmp3);
   __ vxor(vZero, vTmp1, vTmp7);
+  __ vmr(vZero_Stored, vZero);
+  __ addi(data, data , 16);
   __ bdnz(loop);
   __ li(temp4, 0);
+  __ load_const_optimized(temp1, (uintptr_t)&perm_pattern);
+  __ lvx(fromPerm, temp1);
+  __ vec_perm(vZero, vZero, vZero, fromPerm);
+  __ li(temp1, 0);
+  __ andi(temp1, state, 15);
+  __ cmpwi(CCR0,temp1,0);
+  __ beq(CCR0, L_aligned4);// Check if 'to' is aligned (mask lower 4 bits)
+  __ lvx(vHigh,temp4,state);
+  __ lvsr(vPerm,temp4,state);
+  __ addi(state,state,16);
+  __ lvx(vLow,temp4,state);
+  __ vspltisb(vConst1, -1); // Vector with 1s
+  __ vspltisb(vConst7, 0);  // Vector with 7s
+  __ vec_perm(vMask,vConst7,vConst1,vPerm);
+  __ vec_perm(vZero,vZero,vZero,vPerm);
+  __ vsel(vLow,vZero,vLow,vMask);
+  __ vsel(vHigh,vHigh,vZero,vMask);
+  __ stvx(vLow,temp4,state);
+  __ addi(state,state,-16);
+  __ stvx(vHigh,temp4,state);
+  __ b(L_end4);
+  __ bind(L_aligned4);
   __ stvx(vZero, temp4, state);
+  __ bind(L_end4);
   __ blr(); // Return from function

   return start;

From f41e52cc65b4b6f934a0e47c65790cfbee359385 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Mon, 9 Dec 2024 03:31:13 -0500
Subject: [PATCH 11/56] alignment for load and store

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 167 ++++++++++++----------
 1 file changed, 91 insertions(+), 76 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 36518942cf6d6..e366b1e2df4ce 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -630,19 +630,23 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 address generate_ghash_processBlocks() {
+
   StubCodeMark mark(this, "StubRoutines", "ghash");
+
   address start = __ function_entry();
+
+  //Registers for parameters
   Register state = R3_ARG1;   // long[] st0
   Register subkeyH = R4_ARG2; // long[] subH
   Register data = R5_ARG3;    // byte[] data
   Register blocks = R6_ARG4;
+
   Register temp1 = R8;
   Register temp2 = R9;
   Register temp3 = R10;
   Register temp4 = R11;
   Register align = data;
+  //Vector Registers
   VectorRegister vH = VR0;
   VectorRegister vX = VR1;
   VectorRegister vH_shift = VR2;
@@ -650,7 +654,7 @@ address generate_ghash_processBlocks() {
   VectorRegister vTmp2 = VR4;
   VectorRegister vSwappedH = VR5;
   VectorRegister vTmp4 = VR6;
-  VectorRegister vResult = VR7;
+  VectorRegister loadOrder = VR7;
   VectorRegister vMSB = VR8;
   VectorRegister vLowerH = VR9;
   VectorRegister vHigherH = VR10;
@@ -669,109 +673,118 @@ address generate_ghash_processBlocks() {
   VectorRegister vZero_Stored = VR23;
   VectorRegister vMask = VR24;
   VectorRegister vS = VR25;
-  Label L_end, L_aligned, L_align_2,L_end_2,L_aligned3,L_end3,L_aligned4,L_end4;
+
+  Label L_end, L_aligned, L_align_2, L_end_2, L_aligned3;
+  Label L_end3,L_aligned4,L_end4;
+
+
   static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8};
   static const unsigned char perm_pattern2[16] __attribute__((aligned(16))) = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
+
   // Load the address of perm_pattern
+  #ifdef VM_LITTLE_ENDIAN
   __ load_const_optimized(temp1, (uintptr_t)&perm_pattern);
+  #else
+  __ load_const_optimized(temp1, (uintptr_t)&perm_pattern2);
+  #endif
+
   __ li(temp3,0);
   __ vxor(fromPerm, fromPerm, fromPerm); // Clear the vector register
   __ lvxl(fromPerm, temp3, temp1); // Lo
   __ li(temp1, 0xc2);
   __ sldi(temp1, temp1, 56);

   // Load the vector from memory into vConstC2
   __ vxor(vConstC2, vConstC2, vConstC2);
   __ mtvrd(vConstC2, temp1);
   __ vxor(vZero, vZero, vZero);
   // Checking if address is 16 byte aligned and load accordingly.
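  // Note on the unaligned path below: lvx silently ignores the low four
  // address bits, so a misaligned quadword is fetched by loading the two
  // aligned quadwords that straddle it and splicing them together with the
  // permute control vector that lvsl derives from the address. VSX loads
  // such as lxvd2x do not carry this alignment restriction.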
  __ li(temp1, 0);
  __ andi(temp1, subkeyH, 15);
  __ cmpwi(CCR0, temp1, 0);
  __ beq(CCR0, L_aligned);// Check if 'to' is aligned (mask lower 4 bits)
  __ li(temp1, 0);        // Load immediate value 0 into temp
  __ vxor(vH, vH, vH);
  __ lvx(vHigh, temp1, subkeyH);
  __ lvsl(vPerm, temp1, subkeyH);
  __ addi(subkeyH, subkeyH, 16);
  __ lvx(vLow, temp1, subkeyH);
  __ vec_perm(vH, vHigh, vLow, vPerm);
  __ subi(subkeyH, subkeyH, 16);
  __ b(L_end);
  __ bind(L_aligned);
  __ lvx(vH,temp1,subkeyH);
  __ bind(L_end);
  __ vec_perm(vH, vH, vH, fromPerm);

  __ li(temp1, 0);
  __ andi(temp1, state, 15);
  __ cmpwi(CCR0, temp1, 0);
  __ beq(CCR0, L_aligned3);// Check if 'to' is aligned (mask lower 4 bits)
  __ li(temp1, 0);         // Load immediate value 0 into temp
  __ vxor(vZero_Stored, vZero_Stored, vZero_Stored);
  __ lvx(vHigh, temp1, state);// Load H using temp instead of R0
  __ lvsl(vPerm,temp1,state);
  __ addi(state, state, 16);
  __ lvx(vLow, temp1, state);
  __ vec_perm(vZero_Stored, vHigh, vLow, vPerm);
  __ subi(state, state, 16);
  __ b(L_end3);
  __ bind(L_aligned3);
  __ lvx(vZero_Stored, temp1, state);
  __ bind(L_end3);
  __ vec_perm(vZero_Stored, vZero_Stored, vZero_Stored, fromPerm);
  //Operations to obtain lower and higher bytes of subkey H.
  __ vspltisb(vConst1, 1);
  __ vspltisb(vConst7, 7);
  __ vsldoi(vTmp4, vZero, vConst1, 1);// 0x1
  __ vor(vTmp4, vConstC2, vTmp4);     //0xC2...1
  __ vsplt(vMSB, 0, vH);              // MSB of H
  __ vxor(vH_shift, vH_shift, vH_shift);
  __ vsl(vH_shift, vH, vConst1);      // Carry= H<<7
  __ vsrab(vMSB, vMSB, vConst7);
  __ vand(vMSB, vMSB, vTmp4);         //Carry
  __ vxor(vTmp2, vH_shift, vMSB);     // shift H<<<1
  __ vsldoi(vConstC2, vZero, vConstC2, 8);
  __ vsldoi(vSwappedH, vTmp2, vTmp2, 8);// swap L,H
  __ vsldoi(vLowerH, vZero, vSwappedH, 8);  //H.L
  __ vsldoi(vHigherH, vSwappedH, vZero, 8); //H.H
  __ vxor(vTmp1, vTmp1, vTmp1);
  __ vxor(vZero, vZero, vZero);
  // Calculate the number of blocks
  __ mtctr(blocks);
  __ li(temp1, 0);
  __ load_const_optimized(temp2, (uintptr_t)&perm_pattern2);
+  __ lvx(loadOrder, temp2);
  Label loop;
  __ bind(loop);
  // Load immediate value 0 into temp
  __ vxor(vX, vX, vX);
  __ vxor(vZero, vZero, vZero);
  __ li(temp1, 0);
  //alignment
  __ andi(temp1, data, 15);
  __ cmpwi(CCR0, temp1,0);
  __ beq(CCR0, L_align_2);
  __ li(temp1, 0);
  __ lvx(vHigh, temp1, align);
  __ lvsl(vPerm, temp1, align);
  __ addi(align, align, 16);
  __ lvx(vLow, temp1, data);
+  __ vec_perm(vHigh, vHigh, vHigh, loadOrder);
+  __ vec_perm(vLow, vLow, vLow, loadOrder);
+  __ vec_perm(vX, vLow, vHigh, vPerm);
  __ subi(align,align,16);
  __ b(L_end_2);
  __ bind(L_align_2);
-  __ lvx(vX,temp1,data);
+  __ lvx(vX, temp1, data);
+  __ vec_perm(vX, vX, vX, loadOrder);
  __ bind(L_end_2);
-  __ lvx(fromPerm, temp2);
-  __ vec_perm(vX, vX, vX, fromPerm);
-
  __ vxor(vX, vX, vZero_Stored);
  // Perform GCM multiplication
  __ vpmsumd(vTmp1, vLowerH, vX);     // L
  __ vpmsumd(vTmp2, vSwappedH, vX);   // M
  __ vpmsumd(vTmp3, vHigherH, vX);    // H
  __ vpmsumd(vTmp4, vTmp1, vConstC2); // reduction
  __ vsldoi(vTmp5, vTmp2, vZero, 8);  // mL
  __ vsldoi(vTmp6, vZero, vTmp2, 8);  // mH
  __ vxor(vTmp1, vTmp1, vTmp5);       // LL + LL
  __ vxor(vTmp3, vTmp3, vTmp6);       // HH + HH
  __ vsldoi(vTmp1, vTmp1, vTmp1, 8);  // swap
  __ vxor(vTmp1, vTmp1, vTmp4);       // reduction
  __ vsldoi(vTmp7, vTmp1, vTmp1, 8);  // swap
  __ vpmsumd(vTmp1, vTmp1, vConstC2); // reduction
  __ vxor(vTmp7, vTmp7, vTmp3);
  __ vxor(vZero, vTmp1, vTmp7);
  __ vmr(vZero_Stored, vZero);
  __ addi(data, data , 16);
  __ bdnz(loop);
  __ li(temp4, 0);
-  __ load_const_optimized(temp1, (uintptr_t)&perm_pattern);
-  __ lvx(fromPerm, temp1);
+
  __ vec_perm(vZero, vZero, vZero, fromPerm);
  __ li(temp1, 0);
  __ andi(temp1, state, 15);
  __ cmpwi(CCR0,temp1,0);
  __ beq(CCR0, L_aligned4);// Check if 'to' is aligned (mask lower 4 bits)
  __ lvx(vHigh, temp4, state);
  __ lvsr(vPerm, temp4, state);
  __ addi(state, state, 16);
  __ lvx(vLow, temp4, state);
  __ vspltisb(vConst1, -1); // Vector with 1s
  __ vspltisb(vConst7, 0);  // Vector with 7s
  __ vec_perm(vMask, vConst7, vConst1, vPerm);
  __ vec_perm(vZero, vZero, vZero, vPerm);
  __ vsel(vLow, vZero, vLow, vMask);
  __ vsel(vHigh, vHigh, vZero, vMask);
  __ stvx(vLow, temp4, state);
  __ addi(state, state, -16);
  __ stvx(vHigh, temp4, state);
  __ b(L_end4);
  __ bind(L_aligned4);
  __ stvx(vZero, temp4, state);
  __ bind(L_end4);

  __ blr(); // Return from function

  return start;

From 8e1678d9bf0cfef005d40a460eed07338748cfd0 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Mon, 9 Dec 2024 03:32:19 -0500
Subject: [PATCH 12/56] alignment for load and store

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 78 +++++++++++------------
 1 file changed, 38 insertions(+), 40 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index e366b1e2df4ce..7abb64e8c63e4 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -635,18 +635,17 @@ address generate_ghash_processBlocks() {
   address start = __ function_entry();

-  //Registers for parameters
-  Register state = R3_ARG1;   // long[] st0
-  Register subkeyH = R4_ARG2; // long[] subH
-  Register data = R5_ARG3;    // byte[] data
+  // Registers for parameters
+  Register state = R3_ARG1;   // long[] st0
+  Register subkeyH = R4_ARG2; // long[] subH
+  Register data = R5_ARG3;    // byte[] data
   Register blocks = R6_ARG4;
   Register temp1 = R8;
   Register temp2 = R9;
   Register temp3 = R10;
   Register temp4 = R11;
   Register align = data;
-  //Vector Registers
+  // Vector Registers
   VectorRegister vH = VR0;
   VectorRegister vX = VR1;
   VectorRegister vH_shift = VR2;
@@ -689,8 +688,8 @@ address generate_ghash_processBlocks() {
   #endif

   __ li(temp3,0);
-  __ vxor(fromPerm, fromPerm, fromPerm); // Clear the vector register
-  __ lvxl(fromPerm, temp3, temp1); // Lo
+  __ vxor(fromPerm, fromPerm, fromPerm); // Clear the vector register
+  __ lvxl(fromPerm, temp3, temp1);       // Lo
   __ li(temp1, 0xc2);
   __ sldi(temp1, temp1, 56);

@@ -702,8 +701,8 @@ address generate_ghash_processBlocks() {
   __ li(temp1, 0);
   __ andi(temp1, subkeyH, 15);
   __ cmpwi(CCR0, temp1, 0);
-  __ beq(CCR0, L_aligned);// Check if 'to' is aligned (mask lower 4 bits)
-  __ li(temp1, 0);
+  __ beq(CCR0, L_aligned); // Check if 'to' is aligned (mask lower 4 bits)
+  __ li(temp1, 0);
   __ vxor(vH, vH, vH);
   __ lvx(vHigh, temp1, subkeyH);
   __ lvsl(vPerm, temp1, subkeyH);
@@ -720,11 +719,11 @@ address generate_ghash_processBlocks() {
   __ li(temp1, 0);
   __ andi(temp1, state, 15);
   __ cmpwi(CCR0, temp1, 0);
-  __ beq(CCR0, L_aligned3);// Check if 'to' is aligned (mask lower 4 bits)
-  __ li(temp1, 0);
+  __ beq(CCR0, L_aligned3); // Check if 'to' is aligned (mask lower 4 bits)
+  __ li(temp1, 0);
   __ vxor(vZero_Stored, vZero_Stored, vZero_Stored);
   __ lvx(vHigh, temp1, state);
-  __ lvsl(vPerm,temp1,state);
+  __ lvsl(vPerm,temp1,state);
   __ addi(state, state, 16);
   __ lvx(vLow, temp1, state);
   __ vec_perm(vZero_Stored, vHigh, vLow, vPerm);
@@ -737,21 +736,20 @@ address generate_ghash_processBlocks() {
   //Operations to obtain lower and higher bytes of subkey H.
   __ vspltisb(vConst1, 1);
   __ vspltisb(vConst7, 7);
-  __ vsldoi(vTmp4, vZero, vConst1, 1);// 0x1
-  __ vor(vTmp4, vConstC2, vTmp4); //0xC2...1
-  __ vsplt(vMSB, 0, vH); // MSB of H
+  __ vsldoi(vTmp4, vZero, vConst1, 1); // 0x1
+  __ vor(vTmp4, vConstC2, vTmp4);      //0xC2...1
+  __ vsplt(vMSB, 0, vH);               // MSB of H
   __ vxor(vH_shift, vH_shift, vH_shift);
-  __ vsl(vH_shift, vH, vConst1); // Carry= H<<7
+  __ vsl(vH_shift, vH, vConst1);       // Carry= H<<7
   __ vsrab(vMSB, vMSB, vConst7);
-  __ vand(vMSB, vMSB, vTmp4); //Carry
-  __ vxor(vTmp2, vH_shift, vMSB); // shift H<<<1
+  __ vand(vMSB, vMSB, vTmp4);          //Carry
+  __ vxor(vTmp2, vH_shift, vMSB);      // shift H<<<1
   __ vsldoi(vConstC2, vZero, vConstC2, 8);
-  __ vsldoi(vSwappedH, vTmp2, vTmp2, 8);// swap L,H
-  __ vsldoi(vLowerH, vZero, vSwappedH, 8); //H.L
-  __ vsldoi(vHigherH, vSwappedH, vZero, 8); //H.H
+  __ vsldoi(vSwappedH, vTmp2, vTmp2, 8);    // swap L,H
+  __ vsldoi(vLowerH, vZero, vSwappedH, 8);  //H.L
+  __ vsldoi(vHigherH, vSwappedH, vZero, 8); //H.H
   __ vxor(vTmp1, vTmp1, vTmp1);
   __ vxor(vZero, vZero, vZero);
   __ mtctr(blocks);
   __ li(temp1, 0);
   __ load_const_optimized(temp2, (uintptr_t)&perm_pattern2);
@@ -783,18 +781,18 @@ address generate_ghash_processBlocks() {
   __ vxor(vX, vX, vZero_Stored);

   // Perform GCM multiplication
-  __ vpmsumd(vTmp1, vLowerH, vX); // L
-  __ vpmsumd(vTmp2, vSwappedH, vX); // M
-  __ vpmsumd(vTmp3, vHigherH, vX); // H
-  __ vpmsumd(vTmp4, vTmp1, vConstC2); // reduction
-  __ vsldoi(vTmp5, vTmp2, vZero, 8); // mL
-  __ vsldoi(vTmp6, vZero, vTmp2, 8); // mH
-  __ vxor(vTmp1, vTmp1, vTmp5); // LL + LL
-  __ vxor(vTmp3, vTmp3, vTmp6); // HH + HH
-  __ vsldoi(vTmp1, vTmp1, vTmp1, 8); // swap
-  __ vxor(vTmp1, vTmp1, vTmp4); // reduction
-  __ vsldoi(vTmp7, vTmp1, vTmp1, 8); // swap
-  __ vpmsumd(vTmp1, vTmp1, vConstC2); // reduction
+  __ vpmsumd(vTmp1, vLowerH, vX);     // L
+  __ vpmsumd(vTmp2, vSwappedH, vX);   // M
+  __ vpmsumd(vTmp3, vHigherH, vX);    // H
+  __ vpmsumd(vTmp4, vTmp1, vConstC2); // reduction
+  __ vsldoi(vTmp5, vTmp2, vZero, 8);  // mL
+  __ vsldoi(vTmp6, vZero, vTmp2, 8);  // mH
+  __ vxor(vTmp1, vTmp1, vTmp5);       // LL + LL
+  __ vxor(vTmp3, vTmp3, vTmp6);       // HH + HH
+  __ vsldoi(vTmp1, vTmp1, vTmp1, 8);  // swap
+  __ vxor(vTmp1, vTmp1, vTmp4);       // reduction
+  __ vsldoi(vTmp7, vTmp1, vTmp1, 8);  // swap
+  __ vpmsumd(vTmp1, vTmp1, vConstC2); // reduction
   __ vxor(vTmp7, vTmp7, vTmp3);
   __ vxor(vZero, vTmp1, vTmp7);
   __ vmr(vZero_Stored, vZero);
@@ -806,13 +804,13 @@ address generate_ghash_processBlocks() {
   __ li(temp1, 0);
   __ andi(temp1, state, 15);
   __ cmpwi(CCR0,temp1,0);
-  __ beq(CCR0, L_aligned4);// Check if 'to' is aligned (mask lower 4 bits)
+  __ beq(CCR0, L_aligned4); // Check if 'to' is aligned (mask lower 4 bits)
   __ lvx(vHigh, temp4, state);
   __ lvsr(vPerm, temp4, state);
   __ addi(state, state, 16);
   __ lvx(vLow, temp4, state);
-  __ vspltisb(vConst1, -1); // Vector with 1s
-  __ vspltisb(vConst7, 0); // Vector with 7s
+  __ vspltisb(vConst1, -1); // Vector with 1s
+  __ vspltisb(vConst7, 0);  // Vector with 7s
   __ vec_perm(vMask, vConst7, vConst1, vPerm);
   __ vec_perm(vZero, vZero, vZero, vPerm);
   __ vsel(vLow, vZero, vLow, vMask);
@@ -826,7 +824,7 @@ address generate_ghash_processBlocks() {
   __ bind(L_end4);


-  __ blr(); // Return from function
+  __ blr(); // Return from function

   return start;

From 8ad855993e5e6c62a35d765a3f763254bb2ec284 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Mon, 9 Dec 2024 03:33:59 -0500
Subject: [PATCH 13/56] spaces

---
 src/hotspot/cpu/ppc/vm_version_ppc.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/hotspot/cpu/ppc/vm_version_ppc.cpp b/src/hotspot/cpu/ppc/vm_version_ppc.cpp
index 770dffccfd332..a1b4ba678e615 100644
--- a/src/hotspot/cpu/ppc/vm_version_ppc.cpp
+++ b/src/hotspot/cpu/ppc/vm_version_ppc.cpp
@@ -308,10 +308,6 @@ void VM_Version::initialize() {
   if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
     FLAG_SET_DEFAULT(UseGHASHIntrinsics, true);
   }
-  // if (UseGHASHIntrinsics && !has_Crypto_GHASH()) {
-  //   warning("GHASH intrinsics are not available on this CPU");
-  //   FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
-  // }

   if (FLAG_IS_DEFAULT(UseFMA)) {
     FLAG_SET_DEFAULT(UseFMA, true);

From 912b8b84f7bdbad3c81aa87cb956c175c1cea504 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Mon, 9 Dec 2024 11:17:19 -0500
Subject: [PATCH 14/56] spaces fix

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 32 ++++++++++-------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 7abb64e8c63e4..986491036ec4b 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -630,15 +630,14 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 address generate_ghash_processBlocks() {
-
   StubCodeMark mark(this, "StubRoutines", "ghash");
-
+
   address start = __ function_entry();
-
+
   // Registers for parameters
   Register state = R3_ARG1;   // long[] st0
   Register subkeyH = R4_ARG2; // long[] subH
-  Register data = R5_ARG3;    // byte[] data
+  Register data = R5_ARG3;    // byte[] data
   Register blocks = R6_ARG4;
   Register temp1 = R8;
   Register temp2 = R9;
@@ -672,14 +671,14 @@ address generate_ghash_processBlocks() {
   VectorRegister vZero_Stored = VR23;
   VectorRegister vMask = VR24;
   VectorRegister vS = VR25;
-
+
   Label L_end, L_aligned, L_align_2, L_end_2, L_aligned3;
   Label L_end3,L_aligned4,L_end4;
-
-
+
comments --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index e7282388afe2b..0337b204867b8 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -675,7 +675,7 @@ address generate_ghash_processBlocks() { Label L_end, L_aligned, L_align_2, L_end_2, L_aligned3; Label L_end3,L_aligned4,L_end4; - static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; + static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; //byte order for double. static const unsigned char perm_pattern2[16] __attribute__((aligned(16))) = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; // Load the address of perm_pattern From 47190bf5f376822802d06a0fdea37a8387d62d2b Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Sat, 14 Dec 2024 13:12:17 -0500 Subject: [PATCH 17/56] change load instructions --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 107 +++------------------- 1 file changed, 14 insertions(+), 93 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 0337b204867b8..54553033ad712 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -646,6 +646,7 @@ address generate_ghash_processBlocks() { Register align = data; // Vector Registers VectorRegister vH = VR0; + VectorSRegister vHS = VSR32; VectorRegister vX = VR1; VectorRegister vH_shift = VR2; VectorRegister vTmp1 = VR3; @@ -669,68 +670,24 @@ address generate_ghash_processBlocks() { VectorRegister vLow = VR21; VectorRegister vPerm = VR22; VectorRegister vZero_Stored = VR23; + VectorSRegister vZero_StoredS = VSR55; VectorRegister vMask = VR24; VectorRegister vS = VR25; - + VectorSRegister vXS = VSR33; Label L_end, L_aligned, L_align_2, L_end_2, L_aligned3; - Label L_end3,L_aligned4,L_end4; - - static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; //byte order for double. - static const unsigned char perm_pattern2[16] __attribute__((aligned(16))) = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; + Label L_end3, L_aligned4, L_end4; - // Load the address of perm_pattern - #ifdef VM_LITTLE_ENDIAN - __ load_const_optimized(temp1, (uintptr_t)&perm_pattern); - #else - __ load_const_optimized(temp1, (uintptr_t)&perm_pattern2); - #endif - - __ li(temp3,0); - __ vxor(fromPerm, fromPerm, fromPerm); // Clear the vector register - __ lvxl(fromPerm, temp3, temp1); // Lo + static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; __ li(temp1, 0xc2); __ sldi(temp1, temp1, 56); - // Load the vector from memory into vConstC2 __ vxor(vConstC2, vConstC2, vConstC2); __ mtvrd(vConstC2, temp1); __ vxor(vZero, vZero, vZero); + //__ stop("ghash"); // Checking if address is 16 byte aligned and load accordingly. 
- __ li(temp1, 0); - __ andi(temp1, subkeyH, 15); - __ cmpwi(CCR0, temp1, 0); - __ beq(CCR0, L_aligned); // Check if 'to' is aligned (mask lower 4 bits) - __ li(temp1, 0); - __ vxor(vH, vH, vH); - __ lvx(vHigh, temp1, subkeyH); - __ lvsl(vPerm, temp1, subkeyH); - __ addi(subkeyH, subkeyH, 16); - __ lvx(vLow, temp1, subkeyH); - __ vec_perm(vH, vHigh, vLow, vPerm); - __ subi(subkeyH, subkeyH, 16); - __ b(L_end); - __ bind(L_aligned); - __ lvx(vH,temp1,subkeyH); - __ bind(L_end); - __ vec_perm(vH, vH, vH, fromPerm); - - __ li(temp1, 0); - __ andi(temp1, state, 15); - __ cmpwi(CCR0, temp1, 0); - __ beq(CCR0, L_aligned3); // Check if 'to' is aligned (mask lower 4 bits) - __ li(temp1, 0); - __ vxor(vZero_Stored, vZero_Stored, vZero_Stored); - __ lvx(vHigh, temp1, state); - __ lvsl(vPerm,temp1,state); - __ addi(state, state, 16); - __ lvx(vLow, temp1, state); - __ vec_perm(vZero_Stored, vHigh, vLow, vPerm); - __ subi(state, state, 16); - __ b(L_end3); - __ bind(L_aligned3); - __ lvx(vZero_Stored, temp1, state); - __ bind(L_end3); - __ vec_perm(vZero_Stored, vZero_Stored, vZero_Stored, fromPerm); + __ lxvd2x(vHS, subkeyH); + __ lxvd2x(vZero_StoredS, state); //Operations to obtain lower and higher bytes of subkey H. __ vspltisb(vConst1, 1); __ vspltisb(vConst7, 7); @@ -750,34 +707,18 @@ address generate_ghash_processBlocks() { __ vxor(vZero, vZero, vZero); __ mtctr(blocks); __ li(temp1, 0); - __ load_const_optimized(temp2, (uintptr_t)&perm_pattern2); + __ load_const_optimized(temp2, (uintptr_t)&perm_pattern); __ lvx(loadOrder, temp2); + Label loop; __ bind(loop); // Load immediate value 0 into temp __ vxor(vX, vX, vX); __ vxor(vZero, vZero, vZero); __ li(temp1, 0); - //alignment - __ andi(temp1, data, 15); - __ cmpwi(CCR0, temp1,0); - __ beq(CCR0, L_align_2); - __ li(temp1, 0); - __ lvx(vHigh, temp1, align); - __ lvsl(vPerm, temp1, align); - __ addi(align, align, 16); - __ lvx(vLow, temp1, data); - __ vec_perm(vHigh, vHigh, vHigh, loadOrder); - __ vec_perm(vLow, vLow, vLow, loadOrder); - __ vec_perm(vX, vLow, vHigh, vPerm); - __ subi(align,align,16); - __ b(L_end_2); - __ bind(L_align_2); - __ lvx(vX, temp1, data); + __ lxv(vXS, 0, data); __ vec_perm(vX, vX, vX, loadOrder); - __ bind(L_end_2); __ vxor(vX, vX, vZero_Stored); - // Perform GCM multiplication __ vpmsumd(vTmp1, vLowerH, vX); // L __ vpmsumd(vTmp2, vSwappedH, vX); // M @@ -794,32 +735,12 @@ address generate_ghash_processBlocks() { __ vxor(vTmp7, vTmp7, vTmp3); __ vxor(vZero, vTmp1, vTmp7); __ vmr(vZero_Stored, vZero); - __ addi(data, data , 16); + __ addi(data, data, 16); __ bdnz(loop); __ li(temp4, 0); - __ vec_perm(vZero, vZero, vZero, fromPerm); - __ li(temp1, 0); - __ andi(temp1, state, 15); - __ cmpwi(CCR0,temp1,0); - __ beq(CCR0, L_aligned4); // Check if 'to' is aligned (mask lower 4 bits) - __ lvx(vHigh, temp4, state); - __ lvsr(vPerm, temp4, state); - __ addi(state, state, 16); - __ lvx(vLow, temp4, state); - __ vspltisb(vConst1, -1); // Vector with 1s - __ vspltisb(vConst7, 0); // Vector with 7s - __ vec_perm(vMask, vConst7, vConst1, vPerm); - __ vec_perm(vZero, vZero, vZero, vPerm); - __ vsel(vLow, vZero, vLow, vMask); - __ vsel(vHigh, vHigh, vZero, vMask); - __ stvx(vLow, temp4, state); - __ addi(state, state, -16); - __ stvx(vHigh, temp4, state); - __ b(L_end4); - __ bind(L_aligned4); - __ stvx(vZero, temp4, state); - __ bind(L_end4); + __ stxvd2x(vZero->to_vsr(),state); + __ blr(); // Return from function From ff901a86b4d7fce864c35edc9aba3549e68d25d6 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Mon, 16 Dec 2024 
11:17:02 -0500 Subject: [PATCH 18/56] using power 8 load instructions --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 28 +++++++++++++---------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 54553033ad712..3b97d3b2b50ad 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -631,7 +631,6 @@ class StubGenerator: public StubCodeGenerator { } address generate_ghash_processBlocks() { StubCodeMark mark(this, "StubRoutines", "ghash"); - address start = __ function_entry(); // Registers for parameters @@ -661,7 +660,6 @@ address generate_ghash_processBlocks() { VectorRegister vConst1 = VR12; VectorRegister vConst7 = VR13; VectorRegister vConstC2 = VR14; - VectorRegister fromPerm = VR15; VectorRegister vTmp3 = VR16; VectorRegister vTmp5 = VR17; VectorRegister vTmp6 = VR18; @@ -674,8 +672,7 @@ address generate_ghash_processBlocks() { VectorRegister vMask = VR24; VectorRegister vS = VR25; VectorSRegister vXS = VSR33; - Label L_end, L_aligned, L_align_2, L_end_2, L_aligned3; - Label L_end3, L_aligned4, L_end4; + Label L_end, L_aligned; static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; __ li(temp1, 0xc2); @@ -684,7 +681,6 @@ address generate_ghash_processBlocks() { __ vxor(vConstC2, vConstC2, vConstC2); __ mtvrd(vConstC2, temp1); __ vxor(vZero, vZero, vZero); - //__ stop("ghash"); // Checking if address is 16 byte aligned and load accordingly. __ lxvd2x(vHS, subkeyH); __ lxvd2x(vZero_StoredS, state); @@ -709,14 +705,26 @@ address generate_ghash_processBlocks() { __ li(temp1, 0); __ load_const_optimized(temp2, (uintptr_t)&perm_pattern); __ lvx(loadOrder, temp2); - Label loop; __ bind(loop); // Load immediate value 0 into temp - __ vxor(vX, vX, vX); __ vxor(vZero, vZero, vZero); __ li(temp1, 0); - __ lxv(vXS, 0, data); + __ andi(temp1, data, 15); + __ cmpwi(CCR0, temp1, 0); + __ beq(CCR0, L_aligned); // Check if address is aligned (mask lower 4 bits) + __ li(temp1, 0); + __ lvx(vHigh, temp1, data); + __ lvsl(vPerm, temp1, data); + __ addi(data, data, 16); + __ lvx(vLow, temp1, data); + __ vec_perm(vX, vHigh, vLow, vPerm); + __ subi(data, data, 16); + __ b(L_end); + __ bind(L_aligned); + __ li(temp1, 0); + __ lvx(vX, temp1, data); + __ bind(L_end); __ vec_perm(vX, vX, vX, loadOrder); __ vxor(vX, vX, vZero_Stored); // Perform GCM multiplication @@ -738,14 +746,10 @@ address generate_ghash_processBlocks() { __ addi(data, data, 16); __ bdnz(loop); __ li(temp4, 0); - __ stxvd2x(vZero->to_vsr(),state); - - __ blr(); // Return from function return start; - } // -XX:+OptimizeFill : convert fill/copy loops into intrinsic // From acbca2dd6af1dccc6ffb152fd03236b1ef9f6a00 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 18 Dec 2024 04:04:27 -0500 Subject: [PATCH 19/56] using power 8 load instructions --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 34 +++++++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 3b97d3b2b50ad..1e3ebb92bc2be 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -684,27 +684,28 @@ address generate_ghash_processBlocks() { // Checking if address is 16 byte aligned and load accordingly.
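// Aside (illustrative sketch, not part of the patch): lxvd2x reads the quadword
// as two 64-bit doublewords using the full, possibly unaligned, byte address. On
// a little-endian host each doubleword arrives in host byte order, while GHASH
// treats the block as one big-endian 128-bit value; that mismatch is what the
// byte-reversing permutes in this series compensate for (perm_pattern here, the
// lvsl/vxor-0xf trick in later patches). A hedged scalar model of the fix-up,
// taking the two loaded doublewords as plain values (names are illustrative):

#include <stdint.h>

static void ghash_bytes_from_doublewords(const uint64_t dw[2], uint8_t out[16]) {
  for (int d = 0; d < 2; d++) {
    for (int b = 0; b < 8; b++) {
      out[8 * d + b] = (uint8_t)(dw[d] >> (56 - 8 * b)); // most significant byte first
    }
  }
}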
__ lxvd2x(vHS, subkeyH); __ lxvd2x(vZero_StoredS, state); - //Operations to obtain lower and higher bytes of subkey H. + // Operations to obtain lower and higher bytes of subkey H. __ vspltisb(vConst1, 1); __ vspltisb(vConst7, 7); __ vsldoi(vTmp4, vZero, vConst1, 1); // 0x1 - __ vor(vTmp4, vConstC2, vTmp4); //0xC2...1 + __ vor(vTmp4, vConstC2, vTmp4); // 0xC2...1 __ vsplt(vMSB, 0, vH); // MSB of H __ vxor(vH_shift, vH_shift, vH_shift); __ vsl(vH_shift, vH, vConst1); // Carry= H<<7 __ vsrab(vMSB, vMSB, vConst7); - __ vand(vMSB, vMSB, vTmp4); //Carry + __ vand(vMSB, vMSB, vTmp4); // Carry __ vxor(vTmp2, vH_shift, vMSB); // shift H<<<1 __ vsldoi(vConstC2, vZero, vConstC2, 8); __ vsldoi(vSwappedH, vTmp2, vTmp2, 8); // swap L,H - __ vsldoi(vLowerH, vZero, vSwappedH, 8); //H.L - __ vsldoi(vHigherH, vSwappedH, vZero, 8); //H.H + __ vsldoi(vLowerH, vZero, vSwappedH, 8); // H.L + __ vsldoi(vHigherH, vSwappedH, vZero, 8); // H.H __ vxor(vTmp1, vTmp1, vTmp1); __ vxor(vZero, vZero, vZero); __ mtctr(blocks); __ li(temp1, 0); __ load_const_optimized(temp2, (uintptr_t)&perm_pattern); __ lvx(loadOrder, temp2); + // Performing Karatsuba multiplication in Galois fields Label loop; __ bind(loop); // Load immediate value 0 into temp @@ -728,24 +729,23 @@ address generate_ghash_processBlocks() { __ vec_perm(vX, vX, vX, loadOrder); __ vxor(vX, vX, vZero_Stored); // Perform GCM multiplication - __ vpmsumd(vTmp1, vLowerH, vX); // L - __ vpmsumd(vTmp2, vSwappedH, vX); // M - __ vpmsumd(vTmp3, vHigherH, vX); // H + __ vpmsumd(vTmp1, vLowerH, vX); // L : Lower Half of subkey H + __ vpmsumd(vTmp2, vSwappedH, vX); // M : Combined halves of subkey H + __ vpmsumd(vTmp3, vHigherH, vX); // H : Higher Half of subkeyH __ vpmsumd(vTmp4, vTmp1, vConstC2); // reduction - __ vsldoi(vTmp5, vTmp2, vZero, 8); // mL - __ vsldoi(vTmp6, vZero, vTmp2, 8); // mH - __ vxor(vTmp1, vTmp1, vTmp5); // LL + LL - __ vxor(vTmp3, vTmp3, vTmp6); // HH + HH + __ vsldoi(vTmp5, vTmp2, vZero, 8); // mL : Extract the lower 64 bits of M + __ vsldoi(vTmp6, vZero, vTmp2, 8); // mH : Extract the higher 64 bits of M + __ vxor(vTmp1, vTmp1, vTmp5); // LL + LL : Combine L and mL (partial result for lower half) + __ vxor(vTmp3, vTmp3, vTmp6); // HH + HH : Combine H and mH (partial result for upper half) __ vsldoi(vTmp1, vTmp1, vTmp1, 8); // swap - __ vxor(vTmp1, vTmp1, vTmp4); // reduction + __ vxor(vTmp1, vTmp1, vTmp4); // reduction using the reduction constant __ vsldoi(vTmp7, vTmp1, vTmp1, 8); // swap - __ vpmsumd(vTmp1, vTmp1, vConstC2); // reduction - __ vxor(vTmp7, vTmp7, vTmp3); - __ vxor(vZero, vTmp1, vTmp7); + __ vpmsumd(vTmp1, vTmp1, vConstC2); // reduction using the reduction constant + __ vxor(vTmp7, vTmp7, vTmp3); // Combine the reduced Low and High products + __ vxor(vZero, vTmp1, vTmp7); __ vmr(vZero_Stored, vZero); __ addi(data, data, 16); __ bdnz(loop); - __ li(temp4, 0); __ stxvd2x(vZero->to_vsr(),state); __ blr(); // Return from function From 1aebac84245e450bc15410df533d9563ea560a2c Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 18 Dec 2024 04:06:15 -0500 Subject: [PATCH 20/56] comments --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 1e3ebb92bc2be..145de944a41fe 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -675,13 +675,13 @@ address generate_ghash_processBlocks() { Label L_end, L_aligned; static const unsigned char 
perm_pattern[16] __attribute__((aligned(16))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + __ li(temp1, 0xc2); __ sldi(temp1, temp1, 56); // Load the vector from memory into vConstC2 __ vxor(vConstC2, vConstC2, vConstC2); __ mtvrd(vConstC2, temp1); __ vxor(vZero, vZero, vZero); - // Checking if address is 16 byte aligned and load accordingly. __ lxvd2x(vHS, subkeyH); __ lxvd2x(vZero_StoredS, state); // Operations to obtain lower and higher bytes of subkey H. From b2766a9cd5be6b5e0a2f16da88b1966a92aa3746 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 18 Dec 2024 04:11:45 -0500 Subject: [PATCH 21/56] comments --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 145de944a41fe..1146f2bd6b98d 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -629,12 +629,20 @@ class StubGenerator: public StubCodeGenerator { return start; } + +// Generate stub for ghash process blocks. +// +// Arguments for generated stub: +// state: R3_ARG1 +// subkeyH: R4_ARG2 +// data: R5_ARG3 +// address generate_ghash_processBlocks() { StubCodeMark mark(this, "StubRoutines", "ghash"); address start = __ function_entry(); // Registers for parameters - Register state = R3_ARG1; // long[] st0 + Register state = R3_ARG1; // long[] state Register subkeyH = R4_ARG2; // long[] subH Register data = R5_ARG3; // byte[] data Register blocks = R6_ARG4; @@ -691,12 +699,12 @@ address generate_ghash_processBlocks() { __ vor(vTmp4, vConstC2, vTmp4); // 0xC2...1 __ vsplt(vMSB, 0, vH); // MSB of H __ vxor(vH_shift, vH_shift, vH_shift); - __ vsl(vH_shift, vH, vConst1); // Carry= H<<7 + __ vsl(vH_shift, vH, vConst1); // Carry = H<<7 __ vsrab(vMSB, vMSB, vConst7); __ vand(vMSB, vMSB, vTmp4); // Carry - __ vxor(vTmp2, vH_shift, vMSB); // shift H<<<1 + __ vxor(vTmp2, vH_shift, vMSB); __ vsldoi(vConstC2, vZero, vConstC2, 8); - __ vsldoi(vSwappedH, vTmp2, vTmp2, 8); // swap L,H + __ vsldoi(vSwappedH, vTmp2, vTmp2, 8); // swap Lower and Higher Halves of subkey H __ vsldoi(vLowerH, vZero, vSwappedH, 8); // H.L __ vsldoi(vHigherH, vSwappedH, vZero, 8); // H.H __ vxor(vTmp1, vTmp1, vTmp1); From f357d6eb1ba552c560bc9b8ed4ad48200ea99b1f Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 18 Dec 2024 06:25:52 -0500 Subject: [PATCH 22/56] spaces fix --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 1146f2bd6b98d..72940de79515a 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -683,7 +683,7 @@ address generate_ghash_processBlocks() { Label L_end, L_aligned; static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - + __ li(temp1, 0xc2); __ sldi(temp1, temp1, 56); // Load the vector from memory into vConstC2 @@ -702,7 +702,7 @@ address generate_ghash_processBlocks() { __ vsl(vH_shift, vH, vConst1); // Carry = H<<7 __ vsrab(vMSB, vMSB, vConst7); __ vand(vMSB, vMSB, vTmp4); // Carry - __ vxor(vTmp2, vH_shift, vMSB); + __ vxor(vTmp2, vH_shift, vMSB); __ vsldoi(vConstC2, vZero, vConstC2, 8); __ vsldoi(vSwappedH, vTmp2, vTmp2, 8); // swap Lower and Higher Halves of subkey H __ vsldoi(vLowerH, vZero, vSwappedH, 8); // H.L @@ 
-713,7 +713,7 @@ address generate_ghash_processBlocks() { __ li(temp1, 0); __ load_const_optimized(temp2, (uintptr_t)&perm_pattern); __ lvx(loadOrder, temp2); - // Performing Karatsuba multiplication in Galois fields + // Performing Karatsuba multiplication in Galois fields Label loop; __ bind(loop); // Load immediate value 0 into temp @@ -750,11 +750,11 @@ address generate_ghash_processBlocks() { __ vsldoi(vTmp7, vTmp1, vTmp1, 8); // swap __ vpmsumd(vTmp1, vTmp1, vConstC2); // reduction using the reduction constant __ vxor(vTmp7, vTmp7, vTmp3); // Combine the reduced Low and High products - __ vxor(vZero, vTmp1, vTmp7); + __ vxor(vZero, vTmp1, vTmp7); __ vmr(vZero_Stored, vZero); __ addi(data, data, 16); __ bdnz(loop); - __ stxvd2x(vZero->to_vsr(),state); + __ stxvd2x(vZero->to_vsr(), state); __ blr(); // Return from function return start; From f4d2e7532eea40aa4dd9f1100411cb782874535e Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 18 Dec 2024 06:34:36 -0500 Subject: [PATCH 23/56] spaces fix --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 72940de79515a..04cdfcef03f16 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -713,7 +713,7 @@ address generate_ghash_processBlocks() { __ li(temp1, 0); __ load_const_optimized(temp2, (uintptr_t)&perm_pattern); __ lvx(loadOrder, temp2); - // Performing Karatsuba multiplication in Galois fields + // Performing Karatsuba multiplication in Galois fields Label loop; __ bind(loop); // Load immediate value 0 into temp From 3487414c4565e403902f40766be19bd48b0a5965 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 8 Jan 2025 12:22:30 -0500 Subject: [PATCH 24/56] reuse registers to reduce count --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 141 ++++++++++------------ 1 file changed, 65 insertions(+), 76 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 04cdfcef03f16..cde93273c140d 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -630,13 +630,14 @@ class StubGenerator: public StubCodeGenerator { return start; } -// Generate stub for ghash process blocks. +// Generate stub for ghash process blocks. 
// // Arguments for generated stub: // state: R3_ARG1 // subkeyH: R4_ARG2 // data: R5_ARG3 -// +// blocks: R6_ARG4 +// address generate_ghash_processBlocks() { StubCodeMark mark(this, "StubRoutines", "ghash"); address start = __ function_entry(); @@ -651,68 +652,50 @@ address generate_ghash_processBlocks() { Register temp3 = R10; Register temp4 = R11; Register align = data; + Register load = R12; // Vector Registers - VectorRegister vH = VR0; - VectorSRegister vHS = VSR32; - VectorRegister vX = VR1; - VectorRegister vH_shift = VR2; - VectorRegister vTmp1 = VR3; - VectorRegister vTmp2 = VR4; - VectorRegister vSwappedH = VR5; - VectorRegister vTmp4 = VR6; - VectorRegister loadOrder = VR7; - VectorRegister vMSB = VR8; - VectorRegister vLowerH = VR9; - VectorRegister vHigherH = VR10; - VectorRegister vZero = VR11; - VectorRegister vConst1 = VR12; - VectorRegister vConst7 = VR13; - VectorRegister vConstC2 = VR14; - VectorRegister vTmp3 = VR16; - VectorRegister vTmp5 = VR17; - VectorRegister vTmp6 = VR18; - VectorRegister vTmp7 = VR19; - VectorRegister vHigh = VR20; - VectorRegister vLow = VR21; - VectorRegister vPerm = VR22; - VectorRegister vZero_Stored = VR23; - VectorSRegister vZero_StoredS = VSR55; - VectorRegister vMask = VR24; - VectorRegister vS = VR25; - VectorSRegister vXS = VSR33; + VectorRegister vZero = VR0; + VectorRegister vH = VR1; + VectorRegister vLowerH = VR2; + VectorRegister vHigherH = VR3; + VectorRegister vTmp4 = VR4; + VectorRegister vTmp5 = VR5; + VectorRegister vTmp6 = VR6; + VectorRegister vTmp7 = VR7; + VectorRegister vTmp8 = VR8; + VectorRegister vTmp9 = VR9; + VectorRegister vTmp10 = VR10; + VectorRegister vTmp11 = VR11; + VectorRegister vTmp12 = VR12; + VectorRegister loadOrder = VR13; + VectorRegister vHigh = VR14; + VectorRegister vLow = VR15; + VectorRegister vState = VR16; + VectorRegister vConstC2 = VR19; Label L_end, L_aligned; - static const unsigned char perm_pattern[16] __attribute__((aligned(16))) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - __ li(temp1, 0xc2); __ sldi(temp1, temp1, 56); + __ vxor(vZero, vZero, vZero); // Load the vector from memory into vConstC2 - __ vxor(vConstC2, vConstC2, vConstC2); __ mtvrd(vConstC2, temp1); - __ vxor(vZero, vZero, vZero); - __ lxvd2x(vHS, subkeyH); - __ lxvd2x(vZero_StoredS, state); + __ lxvd2x(vH->to_vsr(), subkeyH); + __ lxvd2x(vState->to_vsr(), state); // Operations to obtain lower and higher bytes of subkey H. 
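// Aside (illustrative sketch, not part of the patch): renames aside, the
// sequence below still performs the standard GHASH subkey preparation: H is
// doubled in GF(2^128) -- a 128-bit left shift by one, with the carried-out top
// bit folded back in via the 0xC2...01 mask -- and then split into the swapped,
// lower, and upper halves consumed by the multiplication loop. A scalar sketch
// of the doubling step, assuming hi:lo hold the two subkeyH doublewords (names
// are illustrative):

#include <stdint.h>

static void ghash_double_subkey(uint64_t* hi, uint64_t* lo) {
  int msb = (int)(*hi >> 63);      // vsplt + vsrab isolate this top bit
  *hi = (*hi << 1) | (*lo >> 63);  // the 128-bit shift performed by vsl
  *lo <<= 1;
  if (msb) {                       // vand against the 0xC2...01 mask
    *hi ^= 0xC200000000000000ULL;  // top byte of the reduction constant
    *lo ^= 1ULL;                   // ...and its trailing 1 bit
  }
}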
- __ vspltisb(vConst1, 1); - __ vspltisb(vConst7, 7); - __ vsldoi(vTmp4, vZero, vConst1, 1); // 0x1 - __ vor(vTmp4, vConstC2, vTmp4); // 0xC2...1 - __ vsplt(vMSB, 0, vH); // MSB of H - __ vxor(vH_shift, vH_shift, vH_shift); - __ vsl(vH_shift, vH, vConst1); // Carry = H<<7 - __ vsrab(vMSB, vMSB, vConst7); - __ vand(vMSB, vMSB, vTmp4); // Carry - __ vxor(vTmp2, vH_shift, vMSB); + __ vspltisb(vTmp7, 1); + __ vspltisb(vTmp10, 7); + __ vsldoi(vTmp8, vZero, vTmp7, 1); // 0x1 + __ vor(vTmp8, vConstC2, vTmp8); // 0xC2...1 + __ vsplt(vTmp9, 0, vH); // MSB of H + __ vsl(vH, vH, vTmp7); // Carry = H<<7 + __ vsrab(vTmp9, vTmp9, vTmp10); + __ vand(vTmp9, vTmp9, vTmp8); // Carry + __ vxor(vTmp10, vH, vTmp9); __ vsldoi(vConstC2, vZero, vConstC2, 8); - __ vsldoi(vSwappedH, vTmp2, vTmp2, 8); // swap Lower and Higher Halves of subkey H - __ vsldoi(vLowerH, vZero, vSwappedH, 8); // H.L - __ vsldoi(vHigherH, vSwappedH, vZero, 8); // H.H - __ vxor(vTmp1, vTmp1, vTmp1); - __ vxor(vZero, vZero, vZero); + __ vsldoi(vTmp11, vTmp10, vTmp10, 8); // swap Lower and Higher Halves of subkey H + __ vsldoi(vLowerH, vZero, vTmp11, 8); // H.L + __ vsldoi(vHigherH, vTmp11, vZero, 8); // H.H __ mtctr(blocks); - __ li(temp1, 0); - __ load_const_optimized(temp2, (uintptr_t)&perm_pattern); - __ lvx(loadOrder, temp2); // Performing Karatsuba multiplication in Galois fields Label loop; __ bind(loop); @@ -724,39 +707,45 @@ address generate_ghash_processBlocks() { __ beq(CCR0, L_aligned); // Check if address is aligned (mask lower 4 bits) __ li(temp1, 0); __ lvx(vHigh, temp1, data); - __ lvsl(vPerm, temp1, data); + __ lvsl(loadOrder, temp1, data); __ addi(data, data, 16); __ lvx(vLow, temp1, data); - __ vec_perm(vX, vHigh, vLow, vPerm); + __ vec_perm(vH, vHigh, vLow, loadOrder); __ subi(data, data, 16); __ b(L_end); __ bind(L_aligned); __ li(temp1, 0); - __ lvx(vX, temp1, data); + __ lvx(vH, temp1, data); __ bind(L_end); - __ vec_perm(vX, vX, vX, loadOrder); - __ vxor(vX, vX, vZero_Stored); + __ li(temp1, 0); + __ lvsl(loadOrder, temp1); + #ifdef VM_LITTLE_ENDIAN + __ vspltisb(vTmp12, 0xf); + __ vxor(loadOrder, loadOrder, vTmp12); + #endif + __ vec_perm(vH, vH, vH, loadOrder); + __ vxor(vH, vH, vState); // Perform GCM multiplication - __ vpmsumd(vTmp1, vLowerH, vX); // L : Lower Half of subkey H - __ vpmsumd(vTmp2, vSwappedH, vX); // M : Combined halves of subkey H - __ vpmsumd(vTmp3, vHigherH, vX); // H : Higher Half of subkeyH - __ vpmsumd(vTmp4, vTmp1, vConstC2); // reduction - __ vsldoi(vTmp5, vTmp2, vZero, 8); // mL : Extract the lower 64 bits of M - __ vsldoi(vTmp6, vZero, vTmp2, 8); // mH : Extract the higher 64 bits of M - __ vxor(vTmp1, vTmp1, vTmp5); // LL + LL : Combine L and mL (partial result for lower half) - __ vxor(vTmp3, vTmp3, vTmp6); // HH + HH : Combine H and mH (partial result for upper half) - __ vsldoi(vTmp1, vTmp1, vTmp1, 8); // swap - __ vxor(vTmp1, vTmp1, vTmp4); // reduction using the reduction constant - __ vsldoi(vTmp7, vTmp1, vTmp1, 8); // swap - __ vpmsumd(vTmp1, vTmp1, vConstC2); // reduction using the reduction constant - __ vxor(vTmp7, vTmp7, vTmp3); // Combine the reduced Low and High products - __ vxor(vZero, vTmp1, vTmp7); - __ vmr(vZero_Stored, vZero); + __ vpmsumd(vTmp4, vLowerH, vH); // L : Lower Half of subkey H + __ vpmsumd(vTmp5, vTmp11, vH); // M : Combined halves of subkey H + __ vpmsumd(vTmp6, vHigherH, vH); // H : Higher Half of subkeyH + __ vpmsumd(vTmp7, vTmp4, vConstC2); // reduction + __ vsldoi(vTmp8, vTmp5, vZero, 8); // mL : Extract the lower 64 bits of M + __ vsldoi(vTmp9, vZero, 
vTmp5, 8); // mH : Extract the higher 64 bits of M + __ vxor(vTmp4, vTmp4, vTmp8); // LL + LL : Combine L and mL (partial result for lower half) + __ vxor(vTmp6, vTmp6, vTmp9); // HH + HH : Combine H and mH (partial result for upper half) + __ vsldoi(vTmp4, vTmp4, vTmp4, 8); // swap + __ vxor(vTmp4, vTmp4, vTmp7); // reduction using the reduction constant + __ vsldoi(vTmp10, vTmp4, vTmp4, 8); // swap + __ vpmsumd(vTmp4, vTmp4, vConstC2); // reduction using the reduction constant + __ vxor(vTmp10, vTmp10, vTmp6); // Combine the reduced Low and High products + __ vxor(vZero, vTmp4, vTmp10); + __ vmr(vState, vZero); __ addi(data, data, 16); __ bdnz(loop); __ stxvd2x(vZero->to_vsr(), state); - __ blr(); // Return from function - + + __ blr(); return start; } // -XX:+OptimizeFill : convert fill/copy loops into intrinsic From 59acef15763d685a5e46743ab5d97d6098a325f4 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 8 Jan 2025 12:27:03 -0500 Subject: [PATCH 25/56] check for vsx --- src/hotspot/cpu/ppc/vm_version_ppc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/cpu/ppc/vm_version_ppc.cpp b/src/hotspot/cpu/ppc/vm_version_ppc.cpp index a1b4ba678e615..4efbe1fb526cc 100644 --- a/src/hotspot/cpu/ppc/vm_version_ppc.cpp +++ b/src/hotspot/cpu/ppc/vm_version_ppc.cpp @@ -305,7 +305,7 @@ void VM_Version::initialize() { FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); } - if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { + if (FLAG_IS_DEFAULT(UseGHASHIntrinsics) && VM_Version::has_vsx()) { FLAG_SET_DEFAULT(UseGHASHIntrinsics, true); } From c323460d4952f3d15dfb6e9a707ece85e9a47bff Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 8 Jan 2025 12:36:16 -0500 Subject: [PATCH 26/56] clearing bits --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index cde93273c140d..b92e7b9de2f3e 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -695,6 +695,7 @@ address generate_ghash_processBlocks() { __ vsldoi(vTmp11, vTmp10, vTmp10, 8); // swap Lower and Higher Halves of subkey H __ vsldoi(vLowerH, vZero, vTmp11, 8); // H.L __ vsldoi(vHigherH, vTmp11, vZero, 8); // H.H + __ clrldi(blocks, blocks, 32); __ mtctr(blocks); // Performing Karatsuba multiplication in Galois fields Label loop; @@ -744,7 +745,7 @@ address generate_ghash_processBlocks() { __ addi(data, data, 16); __ bdnz(loop); __ stxvd2x(vZero->to_vsr(), state); - + __ blr(); return start; } From 6970291906f49832df68562da87eb76941b20835 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 8 Jan 2025 12:38:16 -0500 Subject: [PATCH 27/56] clearing bits --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index b92e7b9de2f3e..e8edded08cd1b 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -637,7 +637,7 @@ class StubGenerator: public StubCodeGenerator { // subkeyH: R4_ARG2 // data: R5_ARG3 // blocks: R6_ARG4 -// +// address generate_ghash_processBlocks() { StubCodeMark mark(this, "StubRoutines", "ghash"); address start = __ function_entry(); From 09df7626b45006c29b10c4712574e34b047a4910 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Thu, 9 Jan 2025 03:19:26 -0500 Subject: [PATCH 28/56] assertion for blocks --- 
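Note (editorial, not applied by git am): the debug-only guard added below
matters because of how the loop is driven. The block count is moved into the
count register and the loop closes with bdnz, which decrements CTR before
testing it; if a caller ever passed blocks == 0, the first bdnz would wrap CTR
around to 2^64 - 1 and the loop would effectively run forever. An ASSERT-build
check that blocks is positive is therefore cheap insurance (patch 41 later
reworks the same check around asm_assert_eq). The clrldi added two patches
earlier serves the same setup: the Java int count arrives in a 64-bit register,
and its upper 32 bits are cleared before the value feeds mtctr.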
src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index e8edded08cd1b..23eb9f8255469 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -672,7 +672,7 @@ address generate_ghash_processBlocks() { VectorRegister vLow = VR15; VectorRegister vState = VR16; VectorRegister vConstC2 = VR19; - Label L_end, L_aligned; + Label L_end, L_aligned, L_error; __ li(temp1, 0xc2); __ sldi(temp1, temp1, 56); @@ -695,6 +695,10 @@ address generate_ghash_processBlocks() { __ vsldoi(vTmp11, vTmp10, vTmp10, 8); // swap Lower and Higher Halves of subkey H __ vsldoi(vLowerH, vZero, vTmp11, 8); // H.L __ vsldoi(vHigherH, vTmp11, vZero, 8); // H.H + #ifdef ASSERT + __ cmpwi(CCR0, blocks, 0); + __ beq(CCR0, L_error); + #endif __ clrldi(blocks, blocks, 32); __ mtctr(blocks); // Performing Karatsuba multiplication in Galois fields @@ -745,8 +749,11 @@ address generate_ghash_processBlocks() { __ addi(data, data, 16); __ bdnz(loop); __ stxvd2x(vZero->to_vsr(), state); - __ blr(); + #ifdef ASSERT + __ bind(L_error); + __ stop("Number of blocks must be positive"); + #endif return start; } // -XX:+OptimizeFill : convert fill/copy loops into intrinsic From eadd408c91d5f2268b4e21ce2a787e6daee33edd Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Thu, 9 Jan 2025 04:03:10 -0500 Subject: [PATCH 29/56] restore --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 29 +++++++++++++---------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 23eb9f8255469..f92dc55298b08 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -1,4 +1,5 @@ /* + * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2024 SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -23,6 +24,7 @@ * */ +#include "precompiled.hpp" #include "asm/macroAssembler.inline.hpp" #include "compiler/oopMap.hpp" #include "gc/shared/barrierSet.hpp" @@ -621,7 +623,7 @@ class StubGenerator: public StubCodeGenerator { // Don't generate, rather use C++ code. address generate_verify_oop() { // this is actually a `FunctionDescriptor*'. 
- address start = nullptr; + address start = 0; #if !defined(PRODUCT) start = CAST_FROM_FN_PTR(address, verify_oop_helper); @@ -752,7 +754,7 @@ address generate_ghash_processBlocks() { __ blr(); #ifdef ASSERT __ bind(L_error); - __ stop("Number of blocks must be positive"); + __ stop("ghash_processBlocks : number of blocks must be positive"); #endif return start; } @@ -2164,8 +2166,7 @@ address generate_ghash_processBlocks() { void generate_type_check(Register sub_klass, Register super_check_offset, Register super_klass, - Register temp1, - Register temp2, + Register temp, Label& L_success) { assert_different_registers(sub_klass, super_check_offset, super_klass); @@ -2173,9 +2174,9 @@ address generate_ghash_processBlocks() { Label L_miss; - __ check_klass_subtype_fast_path(sub_klass, super_klass, temp1, temp2, &L_success, &L_miss, nullptr, + __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, R0, &L_success, &L_miss, nullptr, super_check_offset); - __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success); + __ check_klass_subtype_slow_path(sub_klass, super_klass, temp, R0, &L_success); // Fall through on failure! __ bind(L_miss); @@ -2205,7 +2206,8 @@ address generate_ghash_processBlocks() { const Register R10_oop = R10_ARG8; // actual oop copied const Register R11_klass = R11_scratch1; // oop._klass const Register R12_tmp = R12_scratch2; - const Register R2_tmp = R2; + + const Register R2_minus1 = R2; //__ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); @@ -2243,6 +2245,7 @@ address generate_ghash_processBlocks() { Label load_element, store_element, store_null, success, do_epilogue; __ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it. __ li(R8_offset, 0); // Offset from start of arrays. + __ li(R2_minus1, -1); __ bne(CCR0, load_element); // Empty array: Nothing to do. @@ -2270,7 +2273,7 @@ address generate_ghash_processBlocks() { } __ addi(R8_offset, R8_offset, heapOopSize); // Step to next offset. - __ addic_(R9_remain, R9_remain, -1); // Decrement the count. + __ add_(R9_remain, R2_minus1, R9_remain); // Decrement the count. __ beq(CCR0, success); // ======== loop entry is here ======== @@ -2290,7 +2293,7 @@ address generate_ghash_processBlocks() { __ load_klass(R11_klass, R10_oop); // Query the object klass. - generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp, R2_tmp, + generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp, // Branch to this on success: store_element); // ======== end loop ======== @@ -2624,7 +2627,7 @@ address generate_ghash_processBlocks() { int sco_offset = in_bytes(Klass::super_check_offset_offset()); __ lwz(sco_temp, sco_offset, dst_klass); generate_type_check(src_klass, sco_temp, dst_klass, - temp, /* temp */ R10_ARG8, L_disjoint_plain_copy); + temp, L_disjoint_plain_copy); // Fetch destination element klass from the ObjArrayKlass header. 
int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); @@ -4582,9 +4585,9 @@ address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) { r_bitmap = R11_scratch1, result = R8_ARG6; - __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, - r_array_base, r_array_length, r_array_index, - r_bitmap, result, super_klass_index); + __ lookup_secondary_supers_table(r_sub_klass, r_super_klass, + r_array_base, r_array_length, r_array_index, + r_bitmap, result, super_klass_index); __ blr(); return start; From 3bd8a272b4269be7a90090144b52943c797a6613 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 22 Jan 2025 03:06:18 -0500 Subject: [PATCH 30/56] Comments and vsx check --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 42 +++++++++++++++++++---- src/hotspot/cpu/ppc/vm_version_ppc.cpp | 6 +++- 2 files changed, 40 insertions(+), 8 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index f92dc55298b08..7e00edd933886 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -635,10 +635,20 @@ class StubGenerator: public StubCodeGenerator { // Generate stub for ghash process blocks. // // Arguments for generated stub: -// state: R3_ARG1 -// subkeyH: R4_ARG2 -// data: R5_ARG3 -// blocks: R6_ARG4 +// state: R3_ARG1 (long[] state) +// subkeyH: R4_ARG2 (long[] subH) +// data: R5_ARG3 (byte[] data) +// blocks: R6_ARG4 (number of 16-byte blocks to process) +// +// The polynomials are processed in bit-reflected order for efficiency reasons. +// This optimization leverages the structure of the Galois field arithmetic +// to minimize the number of bit manipulations required during multiplication. +// For an explanation of how this works, refer : +// Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich, +// Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel® +// Architecture Processor" +// http://web.archive.org/web/20130609111954/http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/communications-ia-galois-counter-mode-paper.pdf +// // address generate_ghash_processBlocks() { StubCodeMark mark(this, "StubRoutines", "ghash"); @@ -678,8 +688,7 @@ address generate_ghash_processBlocks() { __ li(temp1, 0xc2); __ sldi(temp1, temp1, 56); - __ vxor(vZero, vZero, vZero); - // Load the vector from memory into vConstC2 + __ vspltisb(vZero, 0); __ mtvrd(vConstC2, temp1); __ lxvd2x(vH->to_vsr(), subkeyH); __ lxvd2x(vState->to_vsr(), state); @@ -703,7 +712,26 @@ address generate_ghash_processBlocks() { #endif __ clrldi(blocks, blocks, 32); __ mtctr(blocks); - // Performing Karatsuba multiplication in Galois fields + // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation. + // + // The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts, + // performing three 128-bit multiplications and combining the results efficiently. + // + // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) + // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 + // + // Inputs: + // - vH: The data vector (state), containing both B0 (lower half) and B1 (higher half). + // - vLowerH: Lower half of the subkey H (A0). + // - vHigherH: Higher half of the subkey H (A1). + // - vConstC2: Constant used for reduction (for final processing). + // + // References: + // Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich, Martin Dixon. 
+ // "Optimized Galois-Counter-Mode Implementation on Intel® Architecture Processor" + // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918 + // + // Label loop; __ bind(loop); // Load immediate value 0 into temp diff --git a/src/hotspot/cpu/ppc/vm_version_ppc.cpp b/src/hotspot/cpu/ppc/vm_version_ppc.cpp index 4efbe1fb526cc..bdf128fc58c77 100644 --- a/src/hotspot/cpu/ppc/vm_version_ppc.cpp +++ b/src/hotspot/cpu/ppc/vm_version_ppc.cpp @@ -305,7 +305,11 @@ void VM_Version::initialize() { FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); } - if (FLAG_IS_DEFAULT(UseGHASHIntrinsics) && VM_Version::has_vsx()) { + if (!VM_Version::has_vsx()) { + warning("GHASH intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); + } + else if (UseGHASHIntrinsics) { FLAG_SET_DEFAULT(UseGHASHIntrinsics, true); } From 047142fe3822d1359532324e20609e4d8f705f59 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 22 Jan 2025 03:10:24 -0500 Subject: [PATCH 31/56] update references --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 7e00edd933886..7165f918f2601 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -727,11 +727,10 @@ address generate_ghash_processBlocks() { // - vConstC2: Constant used for reduction (for final processing). // // References: - // Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich, Martin Dixon. - // "Optimized Galois-Counter-Mode Implementation on Intel® Architecture Processor" + // Shay Gueron, Michael E. Kounavis. + // "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode" // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918 // - // Label loop; __ bind(loop); // Load immediate value 0 into temp From 61cb9738be582aadde2df546d159e4cad0558b46 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 22 Jan 2025 03:18:10 -0500 Subject: [PATCH 32/56] spaces --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 17 ++++++++--------- src/hotspot/cpu/ppc/vm_version_ppc.cpp | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 7165f918f2601..c41c8b4fa3be2 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -1,7 +1,6 @@ /* - * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2024 SAP SE. All rights reserved. + * Copyright (c) 2012, 2025 SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -640,12 +639,12 @@ class StubGenerator: public StubCodeGenerator { // data: R5_ARG3 (byte[] data) // blocks: R6_ARG4 (number of 16-byte blocks to process) // -// The polynomials are processed in bit-reflected order for efficiency reasons. -// This optimization leverages the structure of the Galois field arithmetic +// The polynomials are processed in bit-reflected order for efficiency reasons. +// This optimization leverages the structure of the Galois field arithmetic // to minimize the number of bit manipulations required during multiplication. 
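// Aside (illustrative, not part of the patch): "bit-reflected order" refers to
// GCM's unusual bit numbering. In the specification the most significant bit of
// a block's first byte carries the coefficient of x^0, so the field element "1"
// is the 16-byte block 80 00 ... 00. Reflecting the operands' bits maps this
// convention onto ordinary machine integers, letting vpmsumd's plain carry-less
// multiply be used directly, at the price of the shifted 0xC2... reduction
// constant prepared earlier in the stub.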
// For an explanation of how this works, refer : -// Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich, -// Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel® +// Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich, +// Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel® // Architecture Processor" // http://web.archive.org/web/20130609111954/http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/communications-ia-galois-counter-mode-paper.pdf // @@ -712,11 +711,11 @@ address generate_ghash_processBlocks() { #endif __ clrldi(blocks, blocks, 32); __ mtctr(blocks); - // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation. + // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation. // // The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts, // performing three 128-bit multiplications and combining the results efficiently. - // + // // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 // @@ -727,7 +726,7 @@ address generate_ghash_processBlocks() { // - vConstC2: Constant used for reduction (for final processing). // // References: - // Shay Gueron, Michael E. Kounavis. + // Shay Gueron, Michael E. Kounavis. // "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode" // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918 // diff --git a/src/hotspot/cpu/ppc/vm_version_ppc.cpp b/src/hotspot/cpu/ppc/vm_version_ppc.cpp index bdf128fc58c77..eca18c79829cc 100644 --- a/src/hotspot/cpu/ppc/vm_version_ppc.cpp +++ b/src/hotspot/cpu/ppc/vm_version_ppc.cpp @@ -1,6 +1,6 @@ /* * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2024 SAP SE. All rights reserved. + * Copyright (c) 2012, 2025 SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it From 9ac07dc5e081ccbfcee9244921be1c64dcbe376e Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 22 Jan 2025 03:22:55 -0500 Subject: [PATCH 33/56] spaces --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index c41c8b4fa3be2..2c213f0145881 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -724,7 +724,7 @@ address generate_ghash_processBlocks() { // - vLowerH: Lower half of the subkey H (A0). // - vHigherH: Higher half of the subkey H (A1). // - vConstC2: Constant used for reduction (for final processing). - // + // // References: // Shay Gueron, Michael E. Kounavis. 
// "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode" From 41b1d8c31426d455984cc12c37aa905f0841d34b Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 22 Jan 2025 10:04:21 -0500 Subject: [PATCH 34/56] vsx logic change --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 4 ++-- src/hotspot/cpu/ppc/vm_version_ppc.cpp | 15 +++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 2c213f0145881..a310b2bbff912 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -769,9 +769,9 @@ address generate_ghash_processBlocks() { __ vxor(vTmp6, vTmp6, vTmp9); // HH + HH : Combine H and mH (partial result for upper half) __ vsldoi(vTmp4, vTmp4, vTmp4, 8); // swap __ vxor(vTmp4, vTmp4, vTmp7); // reduction using the reduction constant - __ vsldoi(vTmp10, vTmp4, vTmp4, 8); // swap + __ vsldoi(vTmp10, vTmp4, vTmp4, 8); // swap __ vpmsumd(vTmp4, vTmp4, vConstC2); // reduction using the reduction constant - __ vxor(vTmp10, vTmp10, vTmp6); // Combine the reduced Low and High products + __ vxor(vTmp10, vTmp10, vTmp6); // Combine the reduced Low and High products __ vxor(vZero, vTmp4, vTmp10); __ vmr(vState, vZero); __ addi(data, data, 16); diff --git a/src/hotspot/cpu/ppc/vm_version_ppc.cpp b/src/hotspot/cpu/ppc/vm_version_ppc.cpp index eca18c79829cc..1facc703762a2 100644 --- a/src/hotspot/cpu/ppc/vm_version_ppc.cpp +++ b/src/hotspot/cpu/ppc/vm_version_ppc.cpp @@ -305,12 +305,15 @@ void VM_Version::initialize() { FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false); } - if (!VM_Version::has_vsx()) { - warning("GHASH intrinsics are not available on this CPU"); - FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); - } - else if (UseGHASHIntrinsics) { - FLAG_SET_DEFAULT(UseGHASHIntrinsics, true); + if (VM_Version::has_vsx()) { + if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { + UseGHASHIntrinsics = true; + } + } else if (UseGHASHIntrinsics) { + if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { + warning("GHASH intrinsics are not available on this CPU"); + } + FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); } if (FLAG_IS_DEFAULT(UseFMA)) { From bca7f698afdb3556cd1f18711b32b98955b1be07 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Thu, 23 Jan 2025 03:05:58 -0500 Subject: [PATCH 35/56] comments --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index a310b2bbff912..b07b5ffac6069 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -694,17 +694,17 @@ address generate_ghash_processBlocks() { // Operations to obtain lower and higher bytes of subkey H. 
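// Aside (illustrative sketch, not part of the patch): a scalar reference for the
// Karatsuba decomposition quoted in the comment block above -- (C1:C0) = A1*B1,
// (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1), combined as
// C1 : C0+C1+D1+E1 : D1+C0+D0+E0 : D0. Names below are illustrative. Note that
// the stub needs no explicit E term: vpmsumd multiplies both 64-bit lanes of its
// inputs and XORs the two products, so one multiply against the swapped subkey
// halves yields the middle term directly.

#include <stdint.h>

struct u128 { uint64_t hi, lo; };

// Carry-less 64x64 -> 128-bit multiply: the scalar analogue of one vpmsumd lane.
static u128 clmul64(uint64_t a, uint64_t b) {
  u128 r = {0, 0};
  for (int i = 0; i < 64; i++) {
    if ((b >> i) & 1) {
      r.lo ^= a << i;
      if (i != 0) r.hi ^= a >> (64 - i);
    }
  }
  return r;
}

// (A1:A0) * (B1:B0) over GF(2)[x] using three 64-bit carry-less multiplies.
// out[0] is the least significant doubleword of the 256-bit product.
static void gf_mul_karatsuba(uint64_t a1, uint64_t a0,
                             uint64_t b1, uint64_t b0, uint64_t out[4]) {
  u128 C = clmul64(a1, b1);
  u128 D = clmul64(a0, b0);
  u128 E = clmul64(a0 ^ a1, b0 ^ b1);
  out[0] = D.lo;                      // D0
  out[1] = D.hi ^ C.lo ^ D.lo ^ E.lo; // D1 + C0 + D0 + E0
  out[2] = C.lo ^ C.hi ^ D.hi ^ E.hi; // C0 + C1 + D1 + E1
  out[3] = C.hi;                      // C1
}

// The 256-bit product is then folded back to 128 bits modulo the GHASH
// polynomial x^128 + x^7 + x^2 + x + 1 -- the reduction the loop performs with
// vpmsumd against vConstC2.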
__ vspltisb(vTmp7, 1); __ vspltisb(vTmp10, 7); - __ vsldoi(vTmp8, vZero, vTmp7, 1); // 0x1 + __ vsldoi(vTmp8, vZero, vTmp7, 1); // 0x1 __ vor(vTmp8, vConstC2, vTmp8); // 0xC2...1 - __ vsplt(vTmp9, 0, vH); // MSB of H - __ vsl(vH, vH, vTmp7); // Carry = H<<7 + __ vsplt(vTmp9, 0, vH); // MSB of H + __ vsl(vH, vH, vTmp7); // Carry = H<<7 __ vsrab(vTmp9, vTmp9, vTmp10); - __ vand(vTmp9, vTmp9, vTmp8); // Carry + __ vand(vTmp9, vTmp9, vTmp8); // Carry __ vxor(vTmp10, vH, vTmp9); __ vsldoi(vConstC2, vZero, vConstC2, 8); - __ vsldoi(vTmp11, vTmp10, vTmp10, 8); // swap Lower and Higher Halves of subkey H - __ vsldoi(vLowerH, vZero, vTmp11, 8); // H.L - __ vsldoi(vHigherH, vTmp11, vZero, 8); // H.H + __ vsldoi(vTmp11, vTmp10, vTmp10, 8); // swap Lower and Higher Halves of subkey H + __ vsldoi(vLowerH, vZero, vTmp11, 8); // H.L + __ vsldoi(vHigherH, vTmp11, vZero, 8); // H.H #ifdef ASSERT __ cmpwi(CCR0, blocks, 0); __ beq(CCR0, L_error); @@ -732,12 +732,11 @@ address generate_ghash_processBlocks() { // Label loop; __ bind(loop); - // Load immediate value 0 into temp - __ vxor(vZero, vZero, vZero); + __ vspltisb(vZero, 0); __ li(temp1, 0); __ andi(temp1, data, 15); __ cmpwi(CCR0, temp1, 0); - __ beq(CCR0, L_aligned); // Check if address is aligned (mask lower 4 bits) + __ beq(CCR0, L_aligned); // Check if address is aligned (mask lower 4 bits) __ li(temp1, 0); __ lvx(vHigh, temp1, data); __ lvsl(loadOrder, temp1, data); @@ -758,7 +757,7 @@ address generate_ghash_processBlocks() { #endif __ vec_perm(vH, vH, vH, loadOrder); __ vxor(vH, vH, vState); - // Perform GCM multiplication + // Perform GCM multiplication __ vpmsumd(vTmp4, vLowerH, vH); // L : Lower Half of subkey H __ vpmsumd(vTmp5, vTmp11, vH); // M : Combined halves of subkey H __ vpmsumd(vTmp6, vHigherH, vH); // H : Higher Half of subkeyH From 50ca470da8a96149b9634437ceb29b7daf00eac8 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Thu, 23 Jan 2025 22:13:28 +0530 Subject: [PATCH 36/56] indentation --- src/hotspot/cpu/ppc/vm_version_ppc.cpp | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/hotspot/cpu/ppc/vm_version_ppc.cpp b/src/hotspot/cpu/ppc/vm_version_ppc.cpp index 1facc703762a2..d5612553e7af4 100644 --- a/src/hotspot/cpu/ppc/vm_version_ppc.cpp +++ b/src/hotspot/cpu/ppc/vm_version_ppc.cpp @@ -281,7 +281,7 @@ void VM_Version::initialize() { // The AES intrinsic stubs require AES instruction support. 
if (has_vcipher()) { - if (FLAG_IS_DEFAULT(UseAES)) { + if (FLAG_IS_DEFAULT(UseAES)) { UseAES = true; } } else if (UseAES) { @@ -306,14 +306,13 @@ void VM_Version::initialize() { } if (VM_Version::has_vsx()) { - if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { - UseGHASHIntrinsics = true; - } + if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { + UseGHASHIntrinsics = true; + } } else if (UseGHASHIntrinsics) { - if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { - warning("GHASH intrinsics are not available on this CPU"); - } - FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); + if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics)) + warning("GHASH intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); } if (FLAG_IS_DEFAULT(UseFMA)) { From 24f3379a91df0320baf2a23dcf6246f6c16cc0ba Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Tue, 28 Jan 2025 09:29:59 -0500 Subject: [PATCH 37/56] permute vHigh,vLow --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index b07b5ffac6069..e91fdb18c477c 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -682,6 +682,7 @@ address generate_ghash_processBlocks() { VectorRegister vHigh = VR14; VectorRegister vLow = VR15; VectorRegister vState = VR16; + VectorRegister vPerm = VR17; VectorRegister vConstC2 = VR19; Label L_end, L_aligned, L_error; @@ -711,6 +712,12 @@ address generate_ghash_processBlocks() { #endif __ clrldi(blocks, blocks, 32); __ mtctr(blocks); + __ li(temp1, 0); + __ lvsl(loadOrder, temp1); + #ifdef VM_LITTLE_ENDIAN + __ vspltisb(vTmp12, 0xf); + __ vxor(loadOrder, loadOrder, vTmp12); + #endif // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation. 
// // The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts, @@ -739,23 +746,19 @@ address generate_ghash_processBlocks() { __ beq(CCR0, L_aligned); // Check if address is aligned (mask lower 4 bits) __ li(temp1, 0); __ lvx(vHigh, temp1, data); - __ lvsl(loadOrder, temp1, data); + __ lvsl(vPerm, temp1, data); __ addi(data, data, 16); __ lvx(vLow, temp1, data); - __ vec_perm(vH, vHigh, vLow, loadOrder); + __ vec_perm(vHigh, vHigh, vHigh, loadOrder); + __ vec_perm(vLow, vLow, vLow, loadOrder); + __ vec_perm(vH, vLow, vHigh, vPerm); __ subi(data, data, 16); __ b(L_end); __ bind(L_aligned); __ li(temp1, 0); __ lvx(vH, temp1, data); - __ bind(L_end); - __ li(temp1, 0); - __ lvsl(loadOrder, temp1); - #ifdef VM_LITTLE_ENDIAN - __ vspltisb(vTmp12, 0xf); - __ vxor(loadOrder, loadOrder, vTmp12); - #endif __ vec_perm(vH, vH, vH, loadOrder); + __ bind(L_end); __ vxor(vH, vH, vState); // Perform GCM multiplication __ vpmsumd(vTmp4, vLowerH, vH); // L : Lower Half of subkey H From b4aa7a88b12ad41ca851d779564c21cce1d09b1b Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Tue, 28 Jan 2025 11:19:21 -0500 Subject: [PATCH 38/56] restore changes --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 26 +++++++++++------------ 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index e91fdb18c477c..5119e2cc1f575 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -23,7 +23,6 @@ * */ -#include "precompiled.hpp" #include "asm/macroAssembler.inline.hpp" #include "compiler/oopMap.hpp" #include "gc/shared/barrierSet.hpp" @@ -622,7 +621,7 @@ class StubGenerator: public StubCodeGenerator { // Don't generate, rather use C++ code. address generate_verify_oop() { // this is actually a `FunctionDescriptor*'. - address start = 0; + address start = nullptr; #if !defined(PRODUCT) start = CAST_FROM_FN_PTR(address, verify_oop_helper); @@ -2194,7 +2193,8 @@ address generate_ghash_processBlocks() { void generate_type_check(Register sub_klass, Register super_check_offset, Register super_klass, - Register temp, + Register temp1, + Register temp2, Label& L_success) { assert_different_registers(sub_klass, super_check_offset, super_klass); @@ -2202,9 +2202,9 @@ address generate_ghash_processBlocks() { Label L_miss; - __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, R0, &L_success, &L_miss, nullptr, + __ check_klass_subtype_fast_path(sub_klass, super_klass, temp1, temp2, &L_success, &L_miss, nullptr, super_check_offset); - __ check_klass_subtype_slow_path(sub_klass, super_klass, temp, R0, &L_success); + __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success); // Fall through on failure! __ bind(L_miss); @@ -2234,8 +2234,7 @@ address generate_ghash_processBlocks() { const Register R10_oop = R10_ARG8; // actual oop copied const Register R11_klass = R11_scratch1; // oop._klass const Register R12_tmp = R12_scratch2; - - const Register R2_minus1 = R2; + const Register R2_tmp = R2; //__ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", name); @@ -2272,7 +2271,6 @@ address generate_ghash_processBlocks() { Label load_element, store_element, store_null, success, do_epilogue; __ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it. __ li(R8_offset, 0); // Offset from start of arrays. - __ li(R2_minus1, -1); __ bne(CCR0, load_element); // Empty array: Nothing to do.
@@ -2301,7 +2299,7 @@ address generate_ghash_processBlocks() { } __ addi(R8_offset, R8_offset, heapOopSize); // Step to next offset. - __ add_(R9_remain, R2_minus1, R9_remain); // Decrement the count. + __ addic_(R9_remain, R9_remain, -1); // Decrement the count. __ beq(CCR0, success); // ======== loop entry is here ======== @@ -2321,7 +2319,7 @@ address generate_ghash_processBlocks() { __ load_klass(R11_klass, R10_oop); // Query the object klass. - generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp, + generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp, R2_tmp, // Branch to this on success: store_element); // ======== end loop ======== @@ -2655,7 +2653,7 @@ address generate_ghash_processBlocks() { int sco_offset = in_bytes(Klass::super_check_offset_offset()); __ lwz(sco_temp, sco_offset, dst_klass); generate_type_check(src_klass, sco_temp, dst_klass, - temp, L_disjoint_plain_copy); + temp, /* temp */ R10_ARG8, L_disjoint_plain_copy); // Fetch destination element klass from the ObjArrayKlass header. int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); @@ -4613,9 +4611,9 @@ address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) { r_bitmap = R11_scratch1, result = R8_ARG6; - __ lookup_secondary_supers_table(r_sub_klass, r_super_klass, - r_array_base, r_array_length, r_array_index, - r_bitmap, result, super_klass_index); + __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, + r_array_base, r_array_length, r_array_index, + r_bitmap, result, super_klass_index) __ blr(); return start; From fc2f1c30e8e57a00d5ab466eb030c6f2bb894e03 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Tue, 28 Jan 2025 11:20:06 -0500 Subject: [PATCH 39/56] restore changes --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 5119e2cc1f575..7d28df1410fd6 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -4613,7 +4613,7 @@ address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) { __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, r_array_base, r_array_length, r_array_index, - r_bitmap, result, super_klass_index) + r_bitmap, result, super_klass_index); __ blr(); return start; From 068a248c281faac6a318e1efa721fe4679c37750 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 5 Feb 2025 03:35:13 -0500 Subject: [PATCH 40/56] adapt Condition registers --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 4b6ca40e2a2e7..b603985800576 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -622,8 +622,8 @@ address generate_ghash_processBlocks() { __ vsldoi(vLowerH, vZero, vTmp11, 8); // H.L __ vsldoi(vHigherH, vTmp11, vZero, 8); // H.H #ifdef ASSERT - __ cmpwi(CCR0, blocks, 0); - __ beq(CCR0, L_error); + __ cmpwi(CR0, blocks, 0); + __ beq(CR0, L_error); #endif __ clrldi(blocks, blocks, 32); __ mtctr(blocks); @@ -657,8 +657,8 @@ address generate_ghash_processBlocks() { __ vspltisb(vZero, 0); __ li(temp1, 0); __ andi(temp1, data, 15); - __ cmpwi(CCR0, temp1, 0); - __ beq(CCR0, L_aligned); // Check if address is aligned (mask
lower 4 bits) __ li(temp1, 0); __ lvx(vHigh, temp1, data); __ lvsl(vPerm, temp1, data); From 79d470bf48d0b657e12c044af76afa48acc42179 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Mon, 10 Feb 2025 10:53:15 -0500 Subject: [PATCH 41/56] Aligned accesses --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 61 +++++++++++++++-------- src/hotspot/cpu/ppc/vm_version_ppc.cpp | 8 +-- 2 files changed, 41 insertions(+), 28 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index b603985800576..cde7a8de79238 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -599,7 +599,7 @@ address generate_ghash_processBlocks() { VectorRegister vState = VR16; VectorRegister vPerm = VR17; VectorRegister vConstC2 = VR19; - Label L_end, L_aligned, L_error; + Label L_end, L_aligned, L_error, L_trigger_assert, L_skip_assert; __ li(temp1, 0xc2); __ sldi(temp1, temp1, 56); @@ -622,9 +622,13 @@ address generate_ghash_processBlocks() { __ vsldoi(vLowerH, vZero, vTmp11, 8); // H.L __ vsldoi(vHigherH, vTmp11, vZero, 8); // H.H #ifdef ASSERT - __ cmpwi(CR0, blocks, 0); - __ beq(CR0, L_error); + __ cmpwi(CR0, blocks, 0); // Compare 'blocks' (R6_ARG4) with zero + __ beq(CR0, L_trigger_assert); + __ b(L_skip_assert); // Skip assertion if 'blocks' is nonzero + __ bind(L_trigger_assert); + __ asm_assert_eq("blocks should NOT be zero"); #endif + __ bind(L_skip_assert); __ clrldi(blocks, blocks, 32); __ mtctr(blocks); __ li(temp1, 0); @@ -652,13 +656,37 @@ address generate_ghash_processBlocks() { // "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode" // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918 // - Label loop; - __ bind(loop); + //__ stop("ghash loop"); + Label L_aligned_loop, L_store, L_unaligned_loop; + __ andi(temp1, data, 15); + __ cmpwi(CR0, temp1, 0); + __ beq(CR0, L_aligned_loop); + __ b(L_unaligned_loop); + __ bind(L_aligned_loop); __ vspltisb(vZero, 0); __ li(temp1, 0); - __ andi(temp1, data, 15); - __ cmpwi(CR0, temp1, 0); - __ beq(CR0, L_aligned); // Check if address is aligned (mask lower 4 bits) + __ lvx(vH, temp1, data); + __ vec_perm(vH, vH, vH, loadOrder); + __ vxor(vH, vH, vState); + __ vpmsumd(vTmp4, vLowerH, vH); // L : Lower Half of subkey H + __ vpmsumd(vTmp5, vTmp11, vH); // M : Combined halves of subkey H + __ vpmsumd(vTmp6, vHigherH, vH); // H : Higher Half of subkeyH + __ vpmsumd(vTmp7, vTmp4, vConstC2); // reduction + __ vsldoi(vTmp8, vTmp5, vZero, 8); // mL : Extract the lower 64 bits of M + __ vsldoi(vTmp9, vZero, vTmp5, 8); // mH : Extract the higher 64 bits of M + __ vxor(vTmp4, vTmp4, vTmp8); // LL + LL : Combine L and mL (partial result for lower half) + __ vxor(vTmp6, vTmp6, vTmp9); // HH + HH : Combine H and mH (partial result for upper half) + __ vsldoi(vTmp4, vTmp4, vTmp4, 8); // swap + __ vxor(vTmp4, vTmp4, vTmp7); // reduction using the reduction constant + __ vsldoi(vTmp10, vTmp4, vTmp4, 8); // swap + __ vpmsumd(vTmp4, vTmp4, vConstC2); // reduction using the reduction constant + __ vxor(vTmp10, vTmp10, vTmp6); // Combine the reduced Low and High products + __ vxor(vState, vTmp4, vTmp10); + __ addi(data, data, 16); + __ bdnz(L_aligned_loop); + __ b(L_store); + __ bind(L_unaligned_loop); + __ vspltisb(vZero, 0); __ li(temp1, 0); __ lvx(vHigh, temp1, data); __ lvsl(vPerm, temp1, data); @@ -668,12 +696,6 @@ address generate_ghash_processBlocks() { __ vec_perm(vLow, vLow, vLow, loadOrder); __ vec_perm(vH, 
From 127237512ffdd22d46be9305f51f4cf4a07b4426 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Mon, 10 Feb 2025 12:37:46 -0500
Subject: [PATCH 42/56] common code function

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 68 +++++++++++------------
 1 file changed, 31 insertions(+), 37 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index cde7a8de79238..9433da15050f7 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -546,6 +546,31 @@ class StubGenerator: public StubCodeGenerator {
 return start;
 }
 
+static void computeGCMProduct(MacroAssembler* masm,
+ VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
+ VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
+ VectorRegister vTmp4, VectorRegister vTmp5, VectorRegister vTmp6,
+ VectorRegister vTmp7, VectorRegister vTmp8, VectorRegister vTmp9,
+ VectorRegister vTmp10, VectorRegister vTmp11, Register data) {
+ assert(masm != nullptr, "MacroAssembler pointer is null");
+ masm->vxor(vH, vH, vState);
+ masm->vpmsumd(vTmp4, vLowerH, vH); // L : Lower Half of subkey H
+ masm->vpmsumd(vTmp5, vTmp11, vH); // M : Combined halves of subkey H
+ masm->vpmsumd(vTmp6, vHigherH, vH); // H : Higher Half of subkey H
+ masm->vpmsumd(vTmp7, vTmp4, vConstC2); // Reduction
+ masm->vsldoi(vTmp8, vTmp5, vZero, 8); // mL : Extract the lower 64 bits of M
+ masm->vsldoi(vTmp9, vZero, vTmp5, 8); // mH : Extract the higher 64 bits of M
+ masm->vxor(vTmp4, vTmp4, vTmp8); // LL + LL : Partial result for lower half
+ masm->vxor(vTmp6, vTmp6, vTmp9); // HH + HH : Partial result for upper half
+ masm->vsldoi(vTmp4, vTmp4, vTmp4, 8); // Swap
+ masm->vxor(vTmp4, vTmp4, vTmp7); // Reduction using constant
+ masm->vsldoi(vTmp10, vTmp4, vTmp4, 8); // Swap
+ masm->vpmsumd(vTmp4, vTmp4, vConstC2); // Reduction
+ masm->vxor(vTmp10, vTmp10, vTmp6); // Combine reduced Low & High products
+ masm->vxor(vState, vTmp4, vTmp10);
+ masm->addi(data, data, 16);
+}
+
 // Generate stub for ghash process blocks.
 //
 // Arguments for generated stub:
@@ -656,63 +681,32 @@ address generate_ghash_processBlocks() {
 // "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
 // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
 //
- //__ stop("ghash loop");
 Label L_aligned_loop, L_store, L_unaligned_loop;
 __ andi(temp1, data, 15);
 __ cmpwi(CR0, temp1, 0);
 __ beq(CR0, L_aligned_loop);
+ __ li(temp1,0);
+ __ lvsl(vPerm, temp1, data);
 __ b(L_unaligned_loop);
 __ bind(L_aligned_loop);
 __ vspltisb(vZero, 0);
- __ li(temp1, 0);
 __ lvx(vH, temp1, data);
 __ vec_perm(vH, vH, vH, loadOrder);
- __ vxor(vH, vH, vState);
- __ vpmsumd(vTmp4, vLowerH, vH); // L : Lower Half of subkey H
- __ vpmsumd(vTmp5, vTmp11, vH); // M : Combined halves of subkey H
- __ vpmsumd(vTmp6, vHigherH, vH); // H : Higher Half of subkeyH
- __ vpmsumd(vTmp7, vTmp4, vConstC2); // reduction
- __ vsldoi(vTmp8, vTmp5, vZero, 8); // mL : Extract the lower 64 bits of M
- __ vsldoi(vTmp9, vZero, vTmp5, 8); // mH : Extract the higher 64 bits of M
- __ vxor(vTmp4, vTmp4, vTmp8); // LL + LL : Combine L and mL (partial result for lower half)
- __ vxor(vTmp6, vTmp6, vTmp9); // HH + HH : Combine H and mH (partial result for upper half)
- __ vsldoi(vTmp4, vTmp4, vTmp4, 8); // swap
- __ vxor(vTmp4, vTmp4, vTmp7); // reduction using the reduction constant
- __ vsldoi(vTmp10, vTmp4, vTmp4, 8); // swap
- __ vpmsumd(vTmp4, vTmp4, vConstC2); // reduction using the reduction constant
- __ vxor(vTmp10, vTmp10, vTmp6); // Combine the reduced Low and High products
- __ vxor(vState, vTmp4, vTmp10);
- __ addi(data, data, 16);
+ computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
+ vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11, data);
 __ bdnz(L_aligned_loop);
 __ b(L_store);
 __ bind(L_unaligned_loop);
 __ vspltisb(vZero, 0);
- __ li(temp1, 0);
 __ lvx(vHigh, temp1, data);
- __ lvsl(vPerm, temp1, data);
 __ addi(data, data, 16);
 __ lvx(vLow, temp1, data);
 __ vec_perm(vHigh, vHigh, vHigh, loadOrder);
 __ vec_perm(vLow, vLow, vLow, loadOrder);
 __ vec_perm(vH, vLow, vHigh, vPerm);
 __ subi(data, data, 16);
- __ vxor(vH, vH, vState);
- // Perform GCM multiplication
- __ vpmsumd(vTmp4, vLowerH, vH); // L : Lower Half of subkey H
- __ vpmsumd(vTmp5, vTmp11, vH); // M : Combined halves of subkey H
- __ vpmsumd(vTmp6, vHigherH, vH); // H : Higher Half of subkeyH
- __ vpmsumd(vTmp7, vTmp4, vConstC2); // reduction
- __ vsldoi(vTmp8, vTmp5, vZero, 8); // mL : Extract the lower 64 bits of M
- __ vsldoi(vTmp9, vZero, vTmp5, 8); // mH : Extract the higher 64 bits of M
- __ vxor(vTmp4, vTmp4, vTmp8); // LL + LL : Combine L and mL (partial result for lower half)
- __ vxor(vTmp6, vTmp6, vTmp9); // HH + HH : Combine H and mH (partial result for upper half)
- __ vsldoi(vTmp4, vTmp4, vTmp4, 8); // swap
- __ vxor(vTmp4, vTmp4, vTmp7); // reduction using the reduction constant
- __ vsldoi(vTmp10, vTmp4, vTmp4, 8); // swap
- __ vpmsumd(vTmp4, vTmp4, vConstC2); // reduction using the reduction constant
- __ vxor(vTmp10, vTmp10, vTmp6); // Combine the reduced Low and High products
- __ vxor(vZero, vTmp4, vTmp10);
- __ addi(data, data, 16);
+ computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
+ vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11, data);
 __ bdnz(L_unaligned_loop);
 __ bind(L_store);
 __ stxvd2x(vState->to_vsr(), state);
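
The three vpmsumd products that computeGCMProduct names L, M and H are carry-less
multiplications: vpmsumd multiplies the two 64-bit lanes of its inputs as GF(2)
polynomials and XORs the two 128-bit partial products together. A scalar C++ model
of that instruction, assuming illustrative helper names (clmul64, vpmsumd_model are
not from the patch):

    #include <cstdint>

    struct U128 { uint64_t hi, lo; };

    // Carry-less (polynomial) multiply of two 64-bit values.
    static U128 clmul64(uint64_t a, uint64_t b) {
      U128 r = {0, 0};
      for (int i = 0; i < 64; i++) {
        if ((b >> i) & 1) {
          r.lo ^= a << i;
          if (i != 0) r.hi ^= a >> (64 - i);
        }
      }
      return r;
    }

    // Model of vpmsumd: XOR of the two per-lane carry-less products.
    static U128 vpmsumd_model(uint64_t a1, uint64_t a0,
                              uint64_t b1, uint64_t b0) {
      U128 p1 = clmul64(a1, b1), p0 = clmul64(a0, b0);
      return U128{p1.hi ^ p0.hi, p1.lo ^ p0.lo};
    }

With the operands arranged as in the stub (one half of H zeroed per register), each
call yields a single 64x64 product; with the swapped-halves copy of H it yields the
XOR of the two cross products in one instruction.
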
From cf3f1d4794cfcfcc041c11156b06ebbf966efcb0 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Tue, 11 Feb 2025 02:11:16 -0500
Subject: [PATCH 43/56] common code function

---
 src/hotspot/cpu/ppc/vm_version_ppc.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/hotspot/cpu/ppc/vm_version_ppc.cpp b/src/hotspot/cpu/ppc/vm_version_ppc.cpp
index 962c8c4e80215..ca654579baff4 100644
--- a/src/hotspot/cpu/ppc/vm_version_ppc.cpp
+++ b/src/hotspot/cpu/ppc/vm_version_ppc.cpp
@@ -305,9 +305,15 @@ void VM_Version::initialize() {
 FLAG_SET_DEFAULT(UseAESCTRIntrinsics, false);
 }
 
+ if (VM_Version::has_vsx()) {
 if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
 UseGHASHIntrinsics = true;
 }
+ } else if (UseGHASHIntrinsics) {
+ if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
+ warning("GHASH intrinsics are not available on this CPU");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
 
 if (FLAG_IS_DEFAULT(UseFMA)) {
 FLAG_SET_DEFAULT(UseFMA, true);

From a7d9a9603ccf0aa044c7ff39ffd61e3ffa13aa3b Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Tue, 11 Feb 2025 02:12:08 -0500
Subject: [PATCH 44/56] common code function

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 9433da15050f7..0a2c85cba5186 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -546,7 +546,7 @@ class StubGenerator: public StubCodeGenerator {
 return start;
 }
 
-static void computeGCMProduct(MacroAssembler* masm,
+static void computeGCMProduct(MacroAssembler* masm,
 VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
 VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
 VectorRegister vTmp4, VectorRegister vTmp5, VectorRegister vTmp6,
From 5b94a7a4fa0563f85920bedb1d923c179eb44ebb Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Mon, 17 Feb 2025 09:01:27 -0500
Subject: [PATCH 45/56] Single load inside loop

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 311 +++++++++++-----------
 1 file changed, 160 insertions(+), 151 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 0a2c85cba5186..070d9d9eb03bd 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -546,7 +546,15 @@ class StubGenerator: public StubCodeGenerator {
 return start;
 }
 
-static void computeGCMProduct(MacroAssembler* masm,
+ // Computes the Galois/Counter Mode (GCM) product and reduction.
+ //
+ // This function performs polynomial multiplication of the subkey H with
+ // the current GHASH state using vectorized polynomial multiplication (`vpmsumd`).
+ // The subkey H is divided into lower, middle, and higher halves.
+ // The multiplication results are reduced using `vConstC2` to stay within GF(2^128).
+ // The final computed value is stored back into `vState`.
+
+ static void computeGCMProduct(MacroAssembler* masm,
 VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
 VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
 VectorRegister vTmp4, VectorRegister vTmp5, VectorRegister vTmp6,
@@ -554,165 +562,166 @@ static void computeGCMProduct(MacroAssembler* masm,
 VectorRegister vTmp10, VectorRegister vTmp11, Register data) {
 assert(masm != nullptr, "MacroAssembler pointer is null");
 masm->vxor(vH, vH, vState);
- masm->vpmsumd(vTmp4, vLowerH, vH); // L : Lower Half of subkey H
- masm->vpmsumd(vTmp5, vTmp11, vH); // M : Combined halves of subkey H
- masm->vpmsumd(vTmp6, vHigherH, vH); // H : Higher Half of subkey H
- masm->vpmsumd(vTmp7, vTmp4, vConstC2); // Reduction
- masm->vsldoi(vTmp8, vTmp5, vZero, 8); // mL : Extract the lower 64 bits of M
- masm->vsldoi(vTmp9, vZero, vTmp5, 8); // mH : Extract the higher 64 bits of M
- masm->vxor(vTmp4, vTmp4, vTmp8); // LL + LL : Partial result for lower half
- masm->vxor(vTmp6, vTmp6, vTmp9); // HH + HH : Partial result for upper half
- masm->vsldoi(vTmp4, vTmp4, vTmp4, 8); // Swap
- masm->vxor(vTmp4, vTmp4, vTmp7); // Reduction using constant
- masm->vsldoi(vTmp10, vTmp4, vTmp4, 8); // Swap
- masm->vpmsumd(vTmp4, vTmp4, vConstC2); // Reduction
- masm->vxor(vTmp10, vTmp10, vTmp6); // Combine reduced Low & High products
+ masm->vpmsumd(vTmp4, vLowerH, vH); // L : Lower Half of subkey H
+ masm->vpmsumd(vTmp5, vTmp11, vH); // M : Combined halves of subkey H
+ masm->vpmsumd(vTmp6, vHigherH, vH); // H : Higher Half of subkey H
+ masm->vpmsumd(vTmp7, vTmp4, vConstC2); // Reduction
+ masm->vsldoi(vTmp8, vTmp5, vZero, 8); // mL : Extract the lower 64 bits of M
+ masm->vsldoi(vTmp9, vZero, vTmp5, 8); // mH : Extract the higher 64 bits of M
+ masm->vxor(vTmp4, vTmp4, vTmp8); // LL + LL : Partial result for lower half
+ masm->vxor(vTmp6, vTmp6, vTmp9); // HH + HH : Partial result for upper half
+ masm->vsldoi(vTmp4, vTmp4, vTmp4, 8); // Swap
+ masm->vxor(vTmp4, vTmp4, vTmp7); // Reduction using constant
+ masm->vsldoi(vTmp10, vTmp4, vTmp4, 8); // Swap
+ masm->vpmsumd(vTmp4, vTmp4, vConstC2); // Reduction
+ masm->vxor(vTmp10, vTmp10, vTmp6); // Combine reduced Low & High products
 masm->vxor(vState, vTmp4, vTmp10);
- masm->addi(data, data, 16);
-}
+ masm->addi(data, data, 16);
+ }
 
-// Generate stub for ghash process blocks.
-//
-// Arguments for generated stub:
-// state: R3_ARG1 (long[] state)
-// subkeyH: R4_ARG2 (long[] subH)
-// data: R5_ARG3 (byte[] data)
-// blocks: R6_ARG4 (number of 16-byte blocks to process)
-//
-// The polynomials are processed in bit-reflected order for efficiency reasons.
-// This optimization leverages the structure of the Galois field arithmetic
-// to minimize the number of bit manipulations required during multiplication.
-// For an explanation of how this works, refer :
-// Vinodh Gopal, Erdinc Ozturk, Wajdi Feghali, Jim Guilford, Gil Wolrich,
-// Martin Dixon. "Optimized Galois-Counter-Mode Implementation on Intel®
-// Architecture Processor"
-// http://web.archive.org/web/20130609111954/http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/communications-ia-galois-counter-mode-paper.pdf
-//
-//
-address generate_ghash_processBlocks() {
- StubCodeMark mark(this, "StubRoutines", "ghash");
- address start = __ function_entry();
-
- // Registers for parameters
- Register state = R3_ARG1; // long[] state
- Register subkeyH = R4_ARG2; // long[] subH
- Register data = R5_ARG3; // byte[] data
- Register blocks = R6_ARG4;
- Register temp1 = R8;
- Register temp2 = R9;
- Register temp3 = R10;
- Register temp4 = R11;
- Register align = data;
- Register load = R12;
- // Vector Registers
- VectorRegister vZero = VR0;
- VectorRegister vH = VR1;
- VectorRegister vLowerH = VR2;
- VectorRegister vHigherH = VR3;
- VectorRegister vTmp4 = VR4;
- VectorRegister vTmp5 = VR5;
- VectorRegister vTmp6 = VR6;
- VectorRegister vTmp7 = VR7;
- VectorRegister vTmp8 = VR8;
- VectorRegister vTmp9 = VR9;
- VectorRegister vTmp10 = VR10;
- VectorRegister vTmp11 = VR11;
- VectorRegister vTmp12 = VR12;
- VectorRegister loadOrder = VR13;
- VectorRegister vHigh = VR14;
- VectorRegister vLow = VR15;
- VectorRegister vState = VR16;
- VectorRegister vPerm = VR17;
- VectorRegister vConstC2 = VR19;
- Label L_end, L_aligned, L_error, L_trigger_assert, L_skip_assert;
-
- __ li(temp1, 0xc2);
- __ sldi(temp1, temp1, 56);
- __ vspltisb(vZero, 0);
- __ mtvrd(vConstC2, temp1);
- __ lxvd2x(vH->to_vsr(), subkeyH);
- __ lxvd2x(vState->to_vsr(), state);
- // Operations to obtain lower and higher bytes of subkey H.
- __ vspltisb(vTmp7, 1);
- __ vspltisb(vTmp10, 7);
- __ vsldoi(vTmp8, vZero, vTmp7, 1); // 0x1
- __ vor(vTmp8, vConstC2, vTmp8); // 0xC2...1
- __ vsplt(vTmp9, 0, vH); // MSB of H
- __ vsl(vH, vH, vTmp7); // Carry = H<<7
- __ vsrab(vTmp9, vTmp9, vTmp10);
- __ vand(vTmp9, vTmp9, vTmp8); // Carry
- __ vxor(vTmp10, vH, vTmp9);
- __ vsldoi(vConstC2, vZero, vConstC2, 8);
- __ vsldoi(vTmp11, vTmp10, vTmp10, 8); // swap Lower and Higher Halves of subkey H
- __ vsldoi(vLowerH, vZero, vTmp11, 8); // H.L
- __ vsldoi(vHigherH, vTmp11, vZero, 8); // H.H
-#ifdef ASSERT
- __ cmpwi(CR0, blocks, 0); // Compare 'blocks' (R6_ARG4) with zero
- __ beq(CR0, L_trigger_assert);
- __ b(L_skip_assert); // Skip assertion if 'blocks' is nonzero
- __ bind(L_trigger_assert);
- __ asm_assert_eq("blocks should NOT be zero");
-#endif
- __ bind(L_skip_assert);
- __ clrldi(blocks, blocks, 32);
- __ mtctr(blocks);
- __ li(temp1, 0);
- __ lvsl(loadOrder, temp1);
-#ifdef VM_LITTLE_ENDIAN
- __ vspltisb(vTmp12, 0xf);
- __ vxor(loadOrder, loadOrder, vTmp12);
-#endif
- // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation.
- //
- // The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts,
- // performing three 128-bit multiplications and combining the results efficiently.
- //
- // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
- // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
- //
- // Inputs:
- // - vH: The data vector (state), containing both B0 (lower half) and B1 (higher half).
- // - vLowerH: Lower half of the subkey H (A0).
- // - vHigherH: Higher half of the subkey H (A1).
- // - vConstC2: Constant used for reduction (for final processing).
- //
- // References:
- // Shay Gueron, Michael E. Kounavis.
- // "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
- // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
- //
- Label L_aligned_loop, L_store, L_unaligned_loop;
- __ andi(temp1, data, 15);
- __ cmpwi(CR0, temp1, 0);
- __ beq(CR0, L_aligned_loop);
- __ li(temp1,0);
- __ lvsl(vPerm, temp1, data);
- __ b(L_unaligned_loop);
- __ bind(L_aligned_loop);
+ address generate_ghash_processBlocks() {
+ StubCodeMark mark(this, "StubRoutines", "ghash");
+ address start = __ function_entry();
+
+ // Registers for parameters
+ Register state = R3_ARG1; // long[] state
+ Register subkeyH = R4_ARG2; // long[] subH
+ Register data = R5_ARG3; // byte[] data
+ Register blocks = R6_ARG4;
+ Register temp1 = R8;
+ Register temp2 = R9;
+ Register temp3 = R10;
+ Register temp4 = R11;
+ Register align = data;
+ Register load = R12;
+ // Vector Registers
+ VectorRegister vZero = VR0;
+ VectorRegister vH = VR1;
+ VectorRegister vLowerH = VR2;
+ VectorRegister vHigherH = VR3;
+ VectorRegister vTmp4 = VR4;
+ VectorRegister vTmp5 = VR5;
+ VectorRegister vTmp6 = VR6;
+ VectorRegister vTmp7 = VR7;
+ VectorRegister vTmp8 = VR8;
+ VectorRegister vTmp9 = VR9;
+ VectorRegister vTmp10 = VR10;
+ VectorRegister vTmp11 = VR11;
+ VectorRegister vTmp12 = VR12;
+ VectorRegister loadOrder = VR13;
+ VectorRegister vHigh = VR14;
+ VectorRegister vLow = VR15;
+ VectorRegister vState = VR16;
+ VectorRegister vPerm = VR17;
+ VectorRegister vConstC2 = VR19;
+ Label L_end, L_aligned, L_error, L_trigger_assert, L_skip_assert;
+
+ __ li(temp1, 0xc2);
+ __ sldi(temp1, temp1, 56);
 __ vspltisb(vZero, 0);
- __ lvx(vH, temp1, data);
- __ vec_perm(vH, vH, vH, loadOrder);
- computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
- vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11, data);
- __ bdnz(L_aligned_loop);
- __ b(L_store);
- __ bind(L_unaligned_loop);
+ __ mtvrd(vConstC2, temp1);
+ __ lxvd2x(vH->to_vsr(), subkeyH);
+ __ lxvd2x(vState->to_vsr(), state);
+ // Operations to obtain lower and higher bytes of subkey H.
+ __ vspltisb(vTmp7, 1);
+ __ vspltisb(vTmp10, 7);
+ __ vsldoi(vTmp8, vZero, vTmp7, 1); // 0x1
+ __ vor(vTmp8, vConstC2, vTmp8); // 0xC2...1
+ __ vsplt(vTmp9, 0, vH); // MSB of H
+ __ vsl(vH, vH, vTmp7); // Carry = H<<7
+ __ vsrab(vTmp9, vTmp9, vTmp10);
+ __ vand(vTmp9, vTmp9, vTmp8); // Carry
+ __ vxor(vTmp10, vH, vTmp9);
+ __ vsldoi(vConstC2, vZero, vConstC2, 8);
+ __ vsldoi(vTmp11, vTmp10, vTmp10, 8); // swap Lower and Higher Halves of subkey H
+ __ vsldoi(vLowerH, vZero, vTmp11, 8); // H.L
+ __ vsldoi(vHigherH, vTmp11, vZero, 8); // H.H
+#ifdef ASSERT
+ __ cmpwi(CR0, blocks, 0); // Compare 'blocks' (R6_ARG4) with zero
+ __ beq(CR0, L_trigger_assert);
+ __ b(L_skip_assert); // Skip assertion if 'blocks' is nonzero
+ __ bind(L_trigger_assert);
+ __ asm_assert_eq("blocks should NOT be zero");
+#endif
+ __ bind(L_skip_assert);
+ __ clrldi(blocks, blocks, 32);
+ __ mtctr(blocks);
+ __ li(temp1, 0);
+ __ lvsl(loadOrder, temp1);
+#ifdef VM_LITTLE_ENDIAN
+ __ vspltisb(vTmp12, 0xf);
+ __ vxor(loadOrder, loadOrder, vTmp12);
+#endif
+ // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation.
+ //
+ // The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts,
+ // performing three 128-bit multiplications and combining the results efficiently.
+ //
+ // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
+ // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
+ //
+ // Inputs:
+ // - vH: The data vector (state), containing both B0 (lower half) and B1 (higher half).
+ // - vLowerH: Lower half of the subkey H (A0).
+ // - vHigherH: Higher half of the subkey H (A1).
+ // - vConstC2: Constant used for reduction (for final processing).
+ //
+ // References:
+ // Shay Gueron, Michael E. Kounavis.
+ // "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode" + // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918 + // + Label L_aligned_loop, L_store, L_unaligned_loop; __ vspltisb(vZero, 0); + __ andi(temp1, data, 15); + __ cmpwi(CR0, temp1, 0); + __ beq(CR0, L_aligned_loop); + __ li(temp1,0); + __ lvsl(vPerm, temp1, data); __ lvx(vHigh, temp1, data); - __ addi(data, data, 16); - __ lvx(vLow, temp1, data); - __ vec_perm(vHigh, vHigh, vHigh, loadOrder); - __ vec_perm(vLow, vLow, vLow, loadOrder); - __ vec_perm(vH, vLow, vHigh, vPerm); - __ subi(data, data, 16); - computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState, - vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11, data); - __ bdnz(L_unaligned_loop); - __ bind(L_store); - __ stxvd2x(vState->to_vsr(), state); - __ blr(); - return start; -} + __ b(L_unaligned_loop); + __ bind(L_aligned_loop); + __ lvx(vH, temp1, data); + __ vec_perm(vH, vH, vH, loadOrder); + computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState, + vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11, data); + __ addi(data, data, 16); + __ bdnz(L_aligned_loop); + __ b(L_store); + __ bind(L_unaligned_loop); + __ addi(data, data, 16); + __ lvx(vLow, temp1, data); + __ vec_perm(vTmp4, vHigh, vHigh, loadOrder); + __ vec_perm(vTmp5, vLow, vLow, loadOrder); + __ vec_perm(vH, vTmp5, vTmp4, vPerm); + __ subi(data, data, 16); + computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState, + vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11, data); + __ vmr(vHigh, vLow); + __ addi(data, data, 16); + __ bdnz(L_unaligned_loop); + __ bind(L_store); + __ stxvd2x(vState->to_vsr(), state); + __ blr(); + return start; + } // -XX:+OptimizeFill : convert fill/copy loops into intrinsic // // The code is implemented(ported from sparc) as we believe it benefits JVM98, however From b3fe9d6a6a72a59fe4166d9476dae006e9bf3346 Mon Sep 17 00:00:00 2001 From: suchismith1993 Date: Wed, 19 Feb 2025 03:10:03 -0500 Subject: [PATCH 46/56] remove not needed variables --- src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp index 070d9d9eb03bd..f8b0f051dea7d 100644 --- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp +++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp @@ -553,13 +553,12 @@ class StubGenerator: public StubCodeGenerator { // The subkey H is divided into lower, middle, and higher halves. // The multiplication results are reduced using `vConstC2` to stay within GF(2^128). // The final computed value is stored back into `vState`. 
From b3fe9d6a6a72a59fe4166d9476dae006e9bf3346 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Wed, 19 Feb 2025 03:10:03 -0500
Subject: [PATCH 46/56] remove unneeded variables

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 10 +++------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 070d9d9eb03bd..f8b0f051dea7d 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -553,13 +553,12 @@ class StubGenerator: public StubCodeGenerator {
 // The multiplication results are reduced using `vConstC2` to stay within GF(2^128).
 // The final computed value is stored back into `vState`.
-
 static void computeGCMProduct(MacroAssembler* masm,
 VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
 VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
 VectorRegister vTmp4, VectorRegister vTmp5, VectorRegister vTmp6,
 VectorRegister vTmp7, VectorRegister vTmp8, VectorRegister vTmp9,
- VectorRegister vTmp10, VectorRegister vTmp11, Register data) {
+ VectorRegister vTmp10, VectorRegister vTmp11) {
 assert(masm != nullptr, "MacroAssembler pointer is null");
 masm->vxor(vH, vH, vState);
@@ -663,7 +662,6 @@ class StubGenerator: public StubCodeGenerator {
 __ bind(L_skip_assert);
 __ clrldi(blocks, blocks, 32);
 __ mtctr(blocks);
- __ li(temp1, 0);
 __ lvsl(loadOrder, temp1);
 #ifdef VM_LITTLE_ENDIAN
 __ vspltisb(vTmp12, 0xf);
@@ -701,7 +699,7 @@ class StubGenerator: public StubCodeGenerator {
 __ lvx(vH, temp1, data);
 __ vec_perm(vH, vH, vH, loadOrder);
 computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
- vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11, data);
+ vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11);
 __ addi(data, data, 16);
 __ bdnz(L_aligned_loop);
 __ b(L_store);
@@ -711,11 +709,9 @@ class StubGenerator: public StubCodeGenerator {
 __ vec_perm(vTmp4, vHigh, vHigh, loadOrder);
 __ vec_perm(vTmp5, vLow, vLow, loadOrder);
 __ vec_perm(vH, vTmp5, vTmp4, vPerm);
- __ subi(data, data, 16);
 computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
- vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11, data);
+ vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11);
 __ vmr(vHigh, vLow);
- __ addi(data, data, 16);
 __ bdnz(L_unaligned_loop);
 __ bind(L_store);
 __ stxvd2x(vState->to_vsr(), state);

From b37b09da9874a94a4a5402190bf553ab6018bc2c Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Wed, 19 Feb 2025 03:36:38 -0500
Subject: [PATCH 47/56] remove unneeded variables

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index f8b0f051dea7d..88bb51307504a 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -687,7 +687,6 @@ class StubGenerator: public StubCodeGenerator {
 // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
 //
 Label L_aligned_loop, L_store, L_unaligned_loop;
- __ vspltisb(vZero, 0);
 __ andi(temp1, data, 15);
 __ cmpwi(CR0, temp1, 0);
 __ beq(CR0, L_aligned_loop);

From 68565d447ee7aee4e9f21ae28764f6a84ba52e64 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Thu, 20 Feb 2025 10:36:21 -0500
Subject: [PATCH 48/56] change branch and remove unneeded variables

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 29 ++++++++++-------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 88bb51307504a..2acc72fd6e6e9 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -605,11 +605,6 @@ class StubGenerator: public StubCodeGenerator {
 Register data = R5_ARG3; // byte[] data
 Register blocks = R6_ARG4;
 Register temp1 = R8;
- Register temp2 = R9;
- Register temp3 = R10;
- Register temp4 = R11;
- Register align = data;
- Register load = R12;
 // Vector Registers
 VectorRegister vZero = VR0;
 VectorRegister vH = VR1;
@@ -630,7 +625,6 @@ class StubGenerator: public StubCodeGenerator {
 VectorRegister vState = VR16;
 VectorRegister vPerm = VR17;
 VectorRegister vConstC2 = VR19;
- Label L_end, L_aligned, L_error, L_trigger_assert, L_skip_assert;
 
 __ li(temp1, 0xc2);
 __ sldi(temp1, temp1, 56);
@@ -654,12 +648,8 @@ class StubGenerator: public StubCodeGenerator {
 __ vsldoi(vHigherH, vTmp11, vZero, 8); // H.H
 #ifdef ASSERT
 __ cmpwi(CR0, blocks, 0); // Compare 'blocks' (R6_ARG4) with zero
- __ beq(CR0, L_trigger_assert);
- __ b(L_skip_assert); // Skip assertion if 'blocks' is nonzero
- __ bind(L_trigger_assert);
- __ asm_assert_eq("blocks should NOT be zero");
+ __ asm_assert_ne("blocks should NOT be zero");
 #endif
- __ bind(L_skip_assert);
 __ clrldi(blocks, blocks, 32);
 __ mtctr(blocks);
 __ lvsl(loadOrder, temp1);
@@ -686,14 +676,11 @@ class StubGenerator: public StubCodeGenerator {
 // "Intel® Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode"
 // https://web.archive.org/web/20110609115824/https://software.intel.com/file/24918
 //
- Label L_aligned_loop, L_store, L_unaligned_loop;
+ Label L_aligned_loop, L_store, L_unaligned_loop, L_initialize_unaligned_loop;
 __ andi(temp1, data, 15);
 __ cmpwi(CR0, temp1, 0);
- __ beq(CR0, L_aligned_loop);
- __ li(temp1,0);
- __ lvsl(vPerm, temp1, data);
- __ lvx(vHigh, temp1, data);
- __ b(L_unaligned_loop);
+ __ bne(CR0, L_initialize_unaligned_loop);
+
 __ bind(L_aligned_loop);
 __ lvx(vH, temp1, data);
 __ vec_perm(vH, vH, vH, loadOrder);
@@ -702,6 +689,12 @@ class StubGenerator: public StubCodeGenerator {
 __ addi(data, data, 16);
 __ bdnz(L_aligned_loop);
 __ b(L_store);
+
+ __ bind(L_initialize_unaligned_loop);
+ __ li(temp1,0);
+ __ lvsl(vPerm, temp1, data);
+ __ lvx(vHigh, temp1, data);
+
 __ bind(L_unaligned_loop);
 __ addi(data, data, 16);
 __ lvx(vLow, temp1, data);
@@ -712,9 +705,11 @@ class StubGenerator: public StubCodeGenerator {
 vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11);
 __ vmr(vHigh, vLow);
 __ bdnz(L_unaligned_loop);
+
 __ bind(L_store);
 __ stxvd2x(vState->to_vsr(), state);
 __ blr();
+
 return start;
 }
 // -XX:+OptimizeFill : convert fill/copy loops into intrinsic

From 467af71cd553e842cc87b42429be3011111207c6 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Thu, 20 Feb 2025 10:37:46 -0500
Subject: [PATCH 49/56] change branch and remove unneeded variables

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 2acc72fd6e6e9..9b0ed6e80cf18 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -687,7 +687,7 @@ class StubGenerator: public StubCodeGenerator {
 computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
 vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11);
 __ addi(data, data, 16);
- __ bdnz(L_aligned_loop); 
+ __ bdnz(L_aligned_loop);
 __ b(L_store);
@@ -704,7 +704,7 @@ class StubGenerator: public StubCodeGenerator {
 computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
 vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11);
 __ vmr(vHigh, vLow);
- __ bdnz(L_unaligned_loop); 
+ __ bdnz(L_unaligned_loop);
From 55ba8867f62330b784973d0ebf51e0a0c11e8dad Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Wed, 26 Feb 2025 04:34:20 -0500
Subject: [PATCH 50/56] change pattern for Linux, fix for AIX

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 19 ++++++++++++++-----
 src/hotspot/cpu/ppc/vm_version_ppc.cpp | 3 ++-
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 9b0ed6e80cf18..58d5edfbfb00b 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -656,7 +656,11 @@ class StubGenerator: public StubCodeGenerator {
 #ifdef VM_LITTLE_ENDIAN
 __ vspltisb(vTmp12, 0xf);
 __ vxor(loadOrder, loadOrder, vTmp12);
+#define LE_swap_bytes(x) __ vec_perm(x, x, x, loadOrder)
+#else
+#define LE_swap_bytes(x)
 #endif
+
 // This code performs Karatsuba multiplication in Galois fields to compute the GHASH operation.
 //
 // The Karatsuba method breaks the multiplication of two 128-bit numbers into smaller parts,
@@ -687,7 +687,7 @@ class StubGenerator: public StubCodeGenerator {
 __ bind(L_aligned_loop);
 __ lvx(vH, temp1, data);
- __ vec_perm(vH, vH, vH, loadOrder);
+ LE_swap_bytes(vH);
 computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
 vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11);
 __ addi(data, data, 16);
@@ -698,13 +698,18 @@ class StubGenerator: public StubCodeGenerator {
 __ bind(L_initialize_unaligned_loop);
 __ li(temp1,0);
 __ lvsl(vPerm, temp1, data);
 __ lvx(vHigh, temp1, data);
-
+#ifdef VM_LITTLE_ENDIAN
+ __ xxspltib(vTmp12->to_vsr(), 31);
+ __ vxor(vPerm, vPerm, vTmp12);
+#endif
 __ bind(L_unaligned_loop);
 __ addi(data, data, 16);
 __ lvx(vLow, temp1, data);
- __ vec_perm(vTmp4, vHigh, vHigh, loadOrder);
- __ vec_perm(vTmp5, vLow, vLow, loadOrder);
- __ vec_perm(vH, vTmp5, vTmp4, vPerm);
+#ifdef VM_LITTLE_ENDIAN
+ __ vec_perm(vH, vHigh, vLow, vPerm);
+#else
+ __ vec_perm(vH, vLow, vHigh, vPerm);
+#endif
 computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
 vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11);
 __ vmr(vHigh, vLow);
diff --git a/src/hotspot/cpu/ppc/vm_version_ppc.cpp b/src/hotspot/cpu/ppc/vm_version_ppc.cpp
index ca654579baff4..7a96a18d7830e 100644
--- a/src/hotspot/cpu/ppc/vm_version_ppc.cpp
+++ b/src/hotspot/cpu/ppc/vm_version_ppc.cpp
@@ -310,8 +310,9 @@ void VM_Version::initialize() {
 UseGHASHIntrinsics = true;
 }
 } else if (UseGHASHIntrinsics) {
- if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
+ if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
 warning("GHASH intrinsics are not available on this CPU");
+ }
 FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
 }

From 474b891b3851f4d63c360eb8a952b491efe9dcc8 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Wed, 26 Feb 2025 07:06:28 -0500
Subject: [PATCH 51/56] change pattern for Linux, fix for AIX

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 58d5edfbfb00b..54fafc0e69129 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -705,11 +705,7 @@ class StubGenerator: public StubCodeGenerator {
 __ bind(L_unaligned_loop);
 __ addi(data, data, 16);
 __ lvx(vLow, temp1, data);
-#ifdef VM_LITTLE_ENDIAN
 __ vec_perm(vH, vHigh, vLow, vPerm);
-#else
- __ vec_perm(vH, vLow, vHigh, vPerm);
-#endif
 computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
 vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11);
 __ vmr(vHigh, vLow);
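
PATCH 50 and PATCH 51 exist because GHASH is specified over a big-endian byte stream
while the common linux-ppc64le targets run little-endian; AIX stays big-endian. The
LE_swap_bytes macro (a vec_perm through the 0x0f-XORed lvsl result in loadOrder) is a
full 16-byte reversal so the vector lanes see wire order. A trivial C++ model of that
permute (the function name is illustrative):

    #include <cstdint>

    // Reverse a 16-byte block in place: what LE_swap_bytes achieves with the
    // lvsl-derived, 0x0f-XORed loadOrder permute vector on little-endian.
    static void le_swap_bytes16(uint8_t b[16]) {
      for (int i = 0; i < 8; i++) {
        uint8_t t = b[i];
        b[i] = b[15 - i];
        b[15 - i] = t;
      }
    }

On big-endian AIX the macro expands to nothing and the data is consumed exactly as
loaded, which is why the #else branch defines an empty LE_swap_bytes.
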
From 3bca30f6de5fa9b63295e847923bd82fcb2b397a Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Thu, 27 Feb 2025 08:36:48 -0500
Subject: [PATCH 52/56] use vspltisb

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 71 ++++++++++++-----------
 1 file changed, 36 insertions(+), 35 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 54fafc0e69129..c05095b36e517 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -556,25 +556,25 @@ class StubGenerator: public StubCodeGenerator {
 static void computeGCMProduct(MacroAssembler* masm,
 VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
 VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
- VectorRegister vTmp4, VectorRegister vTmp5, VectorRegister vTmp6,
- VectorRegister vTmp7, VectorRegister vTmp8, VectorRegister vTmp9,
- VectorRegister vTmp10, VectorRegister vTmp11) {
+ VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
+ VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
+ VectorRegister vCombinedResult, VectorRegister vSwappedH) {
 assert(masm != nullptr, "MacroAssembler pointer is null");
 masm->vxor(vH, vH, vState);
- masm->vpmsumd(vTmp4, vLowerH, vH); // L : Lower Half of subkey H
- masm->vpmsumd(vTmp5, vTmp11, vH); // M : Combined halves of subkey H
- masm->vpmsumd(vTmp6, vHigherH, vH); // H : Higher Half of subkey H
- masm->vpmsumd(vTmp7, vTmp4, vConstC2); // Reduction
- masm->vsldoi(vTmp8, vTmp5, vZero, 8); // mL : Extract the lower 64 bits of M
- masm->vsldoi(vTmp9, vZero, vTmp5, 8); // mH : Extract the higher 64 bits of M
- masm->vxor(vTmp4, vTmp4, vTmp8); // LL + LL : Partial result for lower half
- masm->vxor(vTmp6, vTmp6, vTmp9); // HH + HH : Partial result for upper half
- masm->vsldoi(vTmp4, vTmp4, vTmp4, 8); // Swap
- masm->vxor(vTmp4, vTmp4, vTmp7); // Reduction using constant
- masm->vsldoi(vTmp10, vTmp4, vTmp4, 8); // Swap
- masm->vpmsumd(vTmp4, vTmp4, vConstC2); // Reduction
- masm->vxor(vTmp10, vTmp10, vTmp6); // Combine reduced Low & High products
- masm->vxor(vState, vTmp4, vTmp10);
+ masm->vpmsumd(vLowProduct, vLowerH, vH); // L : Lower Half of subkey H
+ masm->vpmsumd(vMidProduct, vSwappedH, vH); // M : Combined halves of subkey H
+ masm->vpmsumd(vHighProduct, vHigherH, vH); // H : Higher Half of subkey H
+ masm->vpmsumd(vReducedLow, vLowProduct, vConstC2); // Reduction
+ masm->vsldoi(vTmp8, vMidProduct, vZero, 8); // mL : Extract the lower 64 bits of M
+ masm->vsldoi(vTmp9, vZero, vMidProduct, 8); // mH : Extract the higher 64 bits of M
+ masm->vxor(vLowProduct, vLowProduct, vTmp8); // LL + LL : Partial result for lower half
+ masm->vxor(vHighProduct, vHighProduct, vTmp9); // HH + HH : Partial result for upper half
+ masm->vsldoi(vLowProduct, vLowProduct, vLowProduct, 8); // Swap
+ masm->vxor(vLowProduct, vLowProduct, vReducedLow); // Reduction using constant
+ masm->vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8); // Swap
+ masm->vpmsumd(vLowProduct, vLowProduct, vConstC2); // Reduction
+ masm->vxor(vCombinedResult, vCombinedResult, vHighProduct); // Combine reduced Low & High products
+ masm->vxor(vState, vLowProduct, vCombinedResult);
 }
 
 // Generate stub for ghash process blocks.
@@ -610,20 +610,21 @@ class StubGenerator: public StubCodeGenerator {
 VectorRegister vH = VR1;
 VectorRegister vLowerH = VR2;
 VectorRegister vHigherH = VR3;
- VectorRegister vTmp4 = VR4;
- VectorRegister vTmp5 = VR5;
- VectorRegister vTmp6 = VR6;
- VectorRegister vTmp7 = VR7;
+ VectorRegister vLowProduct = VR4;
+ VectorRegister vMidProduct = VR5;
+ VectorRegister vHighProduct = VR6;
+ VectorRegister vReducedLow = VR7;
 VectorRegister vTmp8 = VR8;
 VectorRegister vTmp9 = VR9;
 VectorRegister vTmp10 = VR10;
- VectorRegister vTmp11 = VR11;
+ VectorRegister vSwappedH = VR11;
 VectorRegister vTmp12 = VR12;
 VectorRegister loadOrder = VR13;
 VectorRegister vHigh = VR14;
 VectorRegister vLow = VR15;
 VectorRegister vState = VR16;
 VectorRegister vPerm = VR17;
+ VectorRegister vCombinedResult = VR18;
 VectorRegister vConstC2 = VR19;
 
 __ li(temp1, 0xc2);
@@ -633,21 +634,21 @@ class StubGenerator: public StubCodeGenerator {
 __ lxvd2x(vH->to_vsr(), subkeyH);
 __ lxvd2x(vState->to_vsr(), state);
 // Operations to obtain lower and higher bytes of subkey H.
- __ vspltisb(vTmp7, 1);
+ __ vspltisb(vReducedLow, 1);
 __ vspltisb(vTmp10, 7);
- __ vsldoi(vTmp8, vZero, vTmp7, 1); // 0x1
- __ vor(vTmp8, vConstC2, vTmp8); // 0xC2...1
- __ vsplt(vTmp9, 0, vH); // MSB of H
- __ vsl(vH, vH, vTmp7); // Carry = H<<7
+ __ vsldoi(vTmp8, vZero, vReducedLow, 1); // 0x1
+ __ vor(vTmp8, vConstC2, vTmp8); // 0xC2...1
+ __ vsplt(vTmp9, 0, vH); // MSB of H
+ __ vsl(vH, vH, vReducedLow); // Carry = H<<7
 __ vsrab(vTmp9, vTmp9, vTmp10);
- __ vand(vTmp9, vTmp9, vTmp8); // Carry
+ __ vand(vTmp9, vTmp9, vTmp8); // Carry
 __ vxor(vTmp10, vH, vTmp9);
 __ vsldoi(vConstC2, vZero, vConstC2, 8);
- __ vsldoi(vTmp11, vTmp10, vTmp10, 8); // swap Lower and Higher Halves of subkey H
- __ vsldoi(vLowerH, vZero, vTmp11, 8); // H.L
- __ vsldoi(vHigherH, vTmp11, vZero, 8); // H.H
+ __ vsldoi(vSwappedH, vTmp10, vTmp10, 8); // swap Lower and Higher Halves of subkey H
+ __ vsldoi(vLowerH, vZero, vSwappedH, 8); // H.L
+ __ vsldoi(vHigherH, vSwappedH, vZero, 8); // H.H
 #ifdef ASSERT
- __ cmpwi(CR0, blocks, 0); // Compare 'blocks' (R6_ARG4) with zero
+ __ cmpwi(CR0, blocks, 0); // Compare 'blocks' (R6_ARG4) with zero
 __ asm_assert_ne("blocks should NOT be zero");
 #endif
 __ clrldi(blocks, blocks, 32);
@@ -689,7 +690,7 @@ class StubGenerator: public StubCodeGenerator {
 __ lvx(vH, temp1, data);
 LE_swap_bytes(vH);
 computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
- vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11);
+ vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
 __ addi(data, data, 16);
 __ bdnz(L_aligned_loop);
 __ b(L_store);
@@ -699,7 +700,7 @@ class StubGenerator: public StubCodeGenerator {
 __ lvsl(vPerm, temp1, data);
 __ lvx(vHigh, temp1, data);
 #ifdef VM_LITTLE_ENDIAN
- __ xxspltib(vTmp12->to_vsr(), 31);
+ __ vspltisb(vTmp12, -1);
 __ vxor(vPerm, vPerm, vTmp12);
 #endif
 __ bind(L_unaligned_loop);
@@ -707,7 +708,7 @@ class StubGenerator: public StubCodeGenerator {
 __ lvx(vLow, temp1, data);
 __ vec_perm(vH, vHigh, vLow, vPerm);
 computeGCMProduct(_masm, vLowerH, vH, vHigherH, vConstC2, vZero, vState,
- vTmp4, vTmp5, vTmp6, vTmp7, vTmp8, vTmp9, vTmp10, vTmp11);
+ vLowProduct, vMidProduct, vHighProduct, vReducedLow, vTmp8, vTmp9, vCombinedResult, vSwappedH);
 __ vmr(vHigh, vLow);
 __ bdnz(L_unaligned_loop);
From 90d58e692f65b4cdfdd11dbcdfe5da4342e5b0d5 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Tue, 18 Mar 2025 11:35:15 -0500
Subject: [PATCH 53/56] comments

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index c05095b36e517..a9024040547ef 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -554,12 +554,11 @@ class StubGenerator: public StubCodeGenerator {
 // The multiplication results are reduced using `vConstC2` to stay within GF(2^128).
 // The final computed value is stored back into `vState`.
 static void computeGCMProduct(MacroAssembler* masm,
- VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
- VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
- VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
- VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
- VectorRegister vCombinedResult, VectorRegister vSwappedH) {
- assert(masm != nullptr, "MacroAssembler pointer is null");
+ VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
+ VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
+ VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
+ VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
+ VectorRegister vCombinedResult, VectorRegister vSwappedH) {
 masm->vxor(vH, vH, vState);
 masm->vpmsumd(vLowProduct, vLowerH, vH); // L : Lower Half of subkey H
 masm->vpmsumd(vMidProduct, vSwappedH, vH); // M : Combined halves of subkey H
@@ -696,7 +695,7 @@ class StubGenerator: public StubCodeGenerator {
 __ b(L_store);
 
 __ bind(L_initialize_unaligned_loop);
- __ li(temp1,0);
+ __ li(temp1, 0);
 __ lvsl(vPerm, temp1, data);
 __ lvx(vHigh, temp1, data);
 #ifdef VM_LITTLE_ENDIAN

From 89dfcafd44ca0f9d233ab372c4bd92ecde9d855f Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Tue, 18 Mar 2025 11:37:16 -0500
Subject: [PATCH 54/56] comments

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index a9024040547ef..74fc839bfbf60 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -566,12 +566,12 @@ class StubGenerator: public StubCodeGenerator {
 masm->vpmsumd(vReducedLow, vLowProduct, vConstC2); // Reduction
 masm->vsldoi(vTmp8, vMidProduct, vZero, 8); // mL : Extract the lower 64 bits of M
 masm->vsldoi(vTmp9, vZero, vMidProduct, 8); // mH : Extract the higher 64 bits of M
- masm->vxor(vLowProduct, vLowProduct, vTmp8); // LL + LL : Partial result for lower half
- masm->vxor(vHighProduct, vHighProduct, vTmp9); // HH + HH : Partial result for upper half
+ masm->vxor(vLowProduct, vLowProduct, vTmp8); // LL + mL : Partial result for lower half
+ masm->vxor(vHighProduct, vHighProduct, vTmp9); // HH + mH : Partial result for upper half
 masm->vsldoi(vLowProduct, vLowProduct, vLowProduct, 8); // Swap
- masm->vxor(vLowProduct, vLowProduct, vReducedLow); // Reduction using constant
+ masm->vxor(vLowProduct, vLowProduct, vReducedLow);
 masm->vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8); // Swap
- masm->vpmsumd(vLowProduct, vLowProduct, vConstC2); // Reduction
+ masm->vpmsumd(vLowProduct, vLowProduct, vConstC2); // Reduction using constant
 masm->vxor(vCombinedResult, vCombinedResult, vHighProduct); // Combine reduced Low & High products
 masm->vxor(vState, vLowProduct, vCombinedResult);
 }
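
A note on the 0xc2 constant these comment patches keep orbiting. GHASH works in
GF(2^128) modulo x^128 + x^7 + x^2 + x + 1; dropping the x^128 term gives the byte
0x87, which reads back-to-front as 0xE1 in GCM's bit-reflected convention, and
because the stub premultiplies H by x (the H<<1 plus carry sequence above), the
reduction constant is shifted once more, landing on 0xC2 in the top byte. A tiny
self-contained check of that arithmetic (rev8 is an illustrative helper; the
derivation is an inference from the standard GCM literature, not something the
patches state explicitly):

    #include <cstdint>
    #include <cstdio>

    static uint8_t rev8(uint8_t x) {           // bit-reverse one byte
      uint8_t r = 0;
      for (int i = 0; i < 8; i++) r = (uint8_t)(r | (((x >> i) & 1) << (7 - i)));
      return r;
    }

    int main() {
      uint8_t e1 = rev8(0x87);                 // 0xE1: reflected polynomial byte
      uint8_t c2 = (uint8_t)(e1 << 1);         // 0xC2: pre-shifted to pair with H<<1
      std::printf("%02X %02X\n", e1, c2);      // prints: E1 C2
      return 0;
    }
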
From a41fdc27b967546407907e41f98e061d3b5014d1 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Tue, 18 Mar 2025 11:38:17 -0500
Subject: [PATCH 55/56] comments

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 74fc839bfbf60..a86887c0f3a9a 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -569,7 +569,7 @@ class StubGenerator: public StubCodeGenerator {
 masm->vxor(vLowProduct, vLowProduct, vTmp8); // LL + mL : Partial result for lower half
 masm->vxor(vHighProduct, vHighProduct, vTmp9); // HH + mH : Partial result for upper half
 masm->vsldoi(vLowProduct, vLowProduct, vLowProduct, 8); // Swap
- masm->vxor(vLowProduct, vLowProduct, vReducedLow); 
+ masm->vxor(vLowProduct, vLowProduct, vReducedLow);
 masm->vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8); // Swap
 masm->vpmsumd(vLowProduct, vLowProduct, vConstC2); // Reduction using constant
 masm->vxor(vCombinedResult, vCombinedResult, vHighProduct); // Combine reduced Low & High products

From 423c8685dad48509afdeb46611585a26317bc130 Mon Sep 17 00:00:00 2001
From: suchismith1993
Date: Thu, 24 Apr 2025 10:07:47 -0400
Subject: [PATCH 56/56] masm

---
 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp | 32 +++++++++++------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
index 1881ef7308950..4a0ced42ed4e8 100644
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
@@ -553,27 +553,27 @@ class StubGenerator: public StubCodeGenerator {
 // The subkey H is divided into lower, middle, and higher halves.
 // The multiplication results are reduced using `vConstC2` to stay within GF(2^128).
 // The final computed value is stored back into `vState`.
- static void computeGCMProduct(MacroAssembler* masm,
+ static void computeGCMProduct(MacroAssembler* _masm,
 VectorRegister vLowerH, VectorRegister vH, VectorRegister vHigherH,
 VectorRegister vConstC2, VectorRegister vZero, VectorRegister vState,
 VectorRegister vLowProduct, VectorRegister vMidProduct, VectorRegister vHighProduct,
 VectorRegister vReducedLow, VectorRegister vTmp8, VectorRegister vTmp9,
 VectorRegister vCombinedResult, VectorRegister vSwappedH) {
- masm->vxor(vH, vH, vState);
- masm->vpmsumd(vLowProduct, vLowerH, vH); // L : Lower Half of subkey H
- masm->vpmsumd(vMidProduct, vSwappedH, vH); // M : Combined halves of subkey H
- masm->vpmsumd(vHighProduct, vHigherH, vH); // H : Higher Half of subkey H
- masm->vpmsumd(vReducedLow, vLowProduct, vConstC2); // Reduction
- masm->vsldoi(vTmp8, vMidProduct, vZero, 8); // mL : Extract the lower 64 bits of M
- masm->vsldoi(vTmp9, vZero, vMidProduct, 8); // mH : Extract the higher 64 bits of M
- masm->vxor(vLowProduct, vLowProduct, vTmp8); // LL + mL : Partial result for lower half
- masm->vxor(vHighProduct, vHighProduct, vTmp9); // HH + mH : Partial result for upper half
- masm->vsldoi(vLowProduct, vLowProduct, vLowProduct, 8); // Swap
- masm->vxor(vLowProduct, vLowProduct, vReducedLow);
- masm->vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8); // Swap
- masm->vpmsumd(vLowProduct, vLowProduct, vConstC2); // Reduction using constant
- masm->vxor(vCombinedResult, vCombinedResult, vHighProduct); // Combine reduced Low & High products
- masm->vxor(vState, vLowProduct, vCombinedResult);
+ __ vxor(vH, vH, vState);
+ __ vpmsumd(vLowProduct, vLowerH, vH); // L : Lower Half of subkey H
+ __ vpmsumd(vMidProduct, vSwappedH, vH); // M : Combined halves of subkey H
+ __ vpmsumd(vHighProduct, vHigherH, vH); // H : Higher Half of subkey H
+ __ vpmsumd(vReducedLow, vLowProduct, vConstC2); // Reduction
+ __ vsldoi(vTmp8, vMidProduct, vZero, 8); // mL : Extract the lower 64 bits of M
+ __ vsldoi(vTmp9, vZero, vMidProduct, 8); // mH : Extract the higher 64 bits of M
+ __ vxor(vLowProduct, vLowProduct, vTmp8); // LL + mL : Partial result for lower half
+ __ vxor(vHighProduct, vHighProduct, vTmp9); // HH + mH : Partial result for upper half
+ __ vsldoi(vLowProduct, vLowProduct, vLowProduct, 8); // Swap
+ __ vxor(vLowProduct, vLowProduct, vReducedLow);
+ __ vsldoi(vCombinedResult, vLowProduct, vLowProduct, 8); // Swap
+ __ vpmsumd(vLowProduct, vLowProduct, vConstC2); // Reduction using constant
+ __ vxor(vCombinedResult, vCombinedResult, vHighProduct); // Combine reduced Low & High products
+ __ vxor(vState, vLowProduct, vCombinedResult);
 }
 
 // Generate stub for ghash process blocks.