8247645: ChaCha20 intrinsics
Reviewed-by: sviswanathan, ngasson, vlivanov, ascarpino
Jamil Nimeh committed Nov 29, 2022
1 parent 33587ff commit cd6bebb
Showing 28 changed files with 1,590 additions and 38 deletions.
95 changes: 95 additions & 0 deletions src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@@ -2322,6 +2322,40 @@ void mvnw(Register Rd, Register Rm,
}
}

// Single-structure load/store method (all addressing variants)
void ld_st(FloatRegister Vt, SIMD_RegVariant T, int index, Address a,
int op1, int op2, int regs) {
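// The required post-index immediate equals the total transfer size:
// element size in bytes times the number of registers.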
int expectedImmediate = (regVariant_to_elemBits(T) >> 3) * regs;
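// For the B/H/S variants the lane index is split across the Q (bit 30),
// S (bit 12) and size (bits 11:10) fields; the D variant fixes size at 0b01.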
int sVal = (T < D) ? (index >> (2 - T)) & 0x01 : 0;
int opcode = (T < D) ? (T << 2) : ((T & 0x02) << 2);
int size = (T < D) ? ((index << T) & 0x3) : 0b01; // only care about low 2 bits
Register Xn = a.base();
int Rm;

switch (a.getMode()) {
case Address::base_plus_offset:
guarantee(a.offset() == 0, "no offset allowed here");
Rm = 0;
break;
case Address::post:
guarantee(a.offset() == expectedImmediate, "bad offset");
op1 |= 0b100;
Rm = 0b11111;
break;
case Address::post_reg:
op1 |= 0b100;
Rm = a.index()->encoding();
break;
default:
ShouldNotReachHere();
}

starti;
f(0,31), f((index >> (3 - T)), 30);
f(op1, 29, 21), f(Rm, 20, 16), f(op2 | opcode | sVal, 15, 12);
f(size, 11, 10), srf(Xn, 5), rf(Vt, 0);
}

public:

#define INSN1(NAME, op1, op2) \
@@ -2379,6 +2413,66 @@ void mvnw(Register Rd, Register Rm,
#undef INSN3
#undef INSN4

// Handle common single-structure ld/st parameter sanity checks
// for all variations (1 to 4) of SIMD register inputs. This
// method will call the routine that generates the opcode.
template<typename R, typename... Rx>
void ldst_sstr(SIMD_RegVariant T, int index, const Address &a,
int op1, int op2, R firstReg, Rx... otherRegs) {
const FloatRegister vtSet[] = { firstReg, otherRegs... };
const int regCount = sizeof...(otherRegs) + 1;
assert(index >= 0 && (T <= D) && ((T == B && index <= 15) ||
(T == H && index <= 7) || (T == S && index <= 3) ||
(T == D && index <= 1)), "invalid index");
assert(regCount >= 1 && regCount <= 4, "illegal register count");

// When multiple SIMD registers are used, check that they are
// in successive order.
for (int i = 0; i < regCount - 1; i++) {
assert(vtSet[i]->successor() == vtSet[i + 1],
"Registers must be ordered");
}

ld_st(firstReg, T, index, a, op1, op2, regCount);
}

// Define a set of INSN1/2/3/4 macros to handle single-structure
// load/store instructions.
#define INSN1(NAME, op1, op2) \
void NAME(FloatRegister Vt, SIMD_RegVariant T, int index, \
const Address &a) { \
ldst_sstr(T, index, a, op1, op2, Vt); \
}

#define INSN2(NAME, op1, op2) \
void NAME(FloatRegister Vt, FloatRegister Vt2, SIMD_RegVariant T, \
int index, const Address &a) { \
ldst_sstr(T, index, a, op1, op2, Vt, Vt2); \
}

#define INSN3(NAME, op1, op2) \
void NAME(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, \
SIMD_RegVariant T, int index, const Address &a) { \
ldst_sstr(T, index, a, op1, op2, Vt, Vt2, Vt3); \
}

#define INSN4(NAME, op1, op2) \
void NAME(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, \
FloatRegister Vt4, SIMD_RegVariant T, int index, \
const Address &a) { \
ldst_sstr(T, index, a, op1, op2, Vt, Vt2, Vt3, Vt4); \
}

INSN1(st1, 0b001101000, 0b0000);
INSN2(st2, 0b001101001, 0b0000);
INSN3(st3, 0b001101000, 0b0010);
INSN4(st4, 0b001101001, 0b0010);

#undef INSN1
#undef INSN2
#undef INSN3
#undef INSN4

#define INSN(NAME, opc) \
void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
starti; \
@@ -2749,6 +2843,7 @@ void mvnw(Register Rd, Register Rm,
INSN(ushr, 1, 0b000001, /* isSHR = */ true);
INSN(usra, 1, 0b000101, /* isSHR = */ true);
INSN(ssra, 0, 0b000101, /* isSHR = */ true);
INSN(sli, 1, 0b010101, /* isSHR = */ false);

#undef INSN

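The single-structure stores and the new sli instruction defined above are
exercised by the ChaCha20 stub generator further down. As a rough usage
sketch (illustrative only -- it assumes a MacroAssembler context with the
usual "__" macro, and the register choices v4..v7, v29 and r1 are arbitrary,
not taken from the commit):

  // Store 32-bit lane 0 of v4..v7 to [r1], post-incrementing by 16 bytes.
  // The four registers must be in successive order (enforced by ldst_sstr).
  __ st4(v4, v5, v6, v7, __ S, /*index=*/0, __ post(r1, 16));

  // Rotate each 32-bit lane of v4 left by 12 into v29: a logical right
  // shift by 20, then a shift-left-and-insert by 12 over the same source.
  __ ushr(v29, __ T4S, v4, 20);
  __ sli(v29, __ T4S, v4, 12);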
7 changes: 7 additions & 0 deletions src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
@@ -1450,6 +1450,13 @@ class MacroAssembler: public Assembler {
void aesecb_decrypt(Register from, Register to, Register key, Register keylen);
void aes_round(FloatRegister input, FloatRegister subkey);

// ChaCha20 block function support
void cc20_quarter_round(FloatRegister aVec, FloatRegister bVec,
FloatRegister cVec, FloatRegister dVec, FloatRegister scratch,
FloatRegister tbl);
void cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec,
FloatRegister dVec, bool colToDiag);

// Place an ISB after code may have been modified due to a safepoint.
void safepoint_isb();

90 changes: 90 additions & 0 deletions src/hotspot/cpu/aarch64/macroAssembler_aarch64_chacha.cpp
@@ -0,0 +1,90 @@
/*
* Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/

#include "precompiled.hpp"

#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "macroAssembler_aarch64.hpp"
#include "memory/resourceArea.hpp"
#include "runtime/stubRoutines.hpp"

/**
* Perform the quarter round calculations on values contained within
* four SIMD registers.
*
* @param aVec the SIMD register containing only the "a" values
* @param bVec the SIMD register containing only the "b" values
* @param cVec the SIMD register containing only the "c" values
* @param dVec the SIMD register containing only the "d" values
* @param scratch scratch SIMD register used for the 12- and 7-bit left rotations
* @param table the SIMD register used as a table for 8-bit left rotations
*/
void MacroAssembler::cc20_quarter_round(FloatRegister aVec, FloatRegister bVec,
FloatRegister cVec, FloatRegister dVec, FloatRegister scratch,
FloatRegister table) {

// a += b, d ^= a, d <<<= 16
addv(aVec, T4S, aVec, bVec);
eor(dVec, T16B, dVec, aVec);
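// rev32 on 16-bit elements swaps the halves of each 32-bit lane: a left rotation by 16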
rev32(dVec, T8H, dVec);

// c += d, b ^= c, b <<<= 12
addv(cVec, T4S, cVec, dVec);
eor(scratch, T16B, bVec, cVec);
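// The ushr/sli pair computes (x >> 20) | (x << 12): a left rotation by 12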
ushr(bVec, T4S, scratch, 20);
sli(bVec, T4S, scratch, 12);

// a += b, d ^= a, d <<<= 8
addv(aVec, T4S, aVec, bVec);
eor(dVec, T16B, dVec, aVec);
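// The tbl byte shuffle rotates each 32-bit lane left by 8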
tbl(dVec, T16B, dVec, 1, table);

// c += d, b ^= c, b <<<= 7
addv(cVec, T4S, cVec, dVec);
eor(scratch, T16B, bVec, cVec);
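// Same shift/insert trick: (x >> 25) | (x << 7) is a left rotation by 7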
ushr(bVec, T4S, scratch, 25);
sli(bVec, T4S, scratch, 7);
}

/**
* Shift the b, c, and d vectors between columnar and diagonal representations.
* Note that the "a" vector does not shift.
*
* @param bVec the SIMD register containing only the "b" values
* @param cVec the SIMD register containing only the "c" values
* @param dVec the SIMD register containing only the "d" values
* @param colToDiag true if moving columnar to diagonal, false if
* moving diagonal back to columnar.
*/
void MacroAssembler::cc20_shift_lane_org(FloatRegister bVec, FloatRegister cVec,
FloatRegister dVec, bool colToDiag) {
int bShift = colToDiag ? 4 : 12;
int cShift = 8;
int dShift = colToDiag ? 12 : 4;

ext(bVec, T16B, bVec, bVec, bShift);
ext(cVec, T16B, cVec, cVec, cShift);
ext(dVec, T16B, dVec, dVec, dShift);
}
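
For reference, here is a scalar model of what cc20_quarter_round computes in
each 32-bit SIMD lane. This standalone sketch is not part of the commit; the
input and expected output values come from the quarter-round test vector in
RFC 7539, section 2.1.1.

  #include <cstdint>
  #include <cstdio>

  static inline uint32_t rotl32(uint32_t x, int n) {
    return (x << n) | (x >> (32 - n));
  }

  // One ChaCha20 quarter round, annotated with the vector instructions
  // the AArch64 code uses for each rotation amount.
  static void quarter_round(uint32_t& a, uint32_t& b, uint32_t& c, uint32_t& d) {
    a += b; d ^= a; d = rotl32(d, 16);   // rev32 (T8H)
    c += d; b ^= c; b = rotl32(b, 12);   // ushr #20 + sli #12
    a += b; d ^= a; d = rotl32(d, 8);    // tbl with the byte-rotation table
    c += d; b ^= c; b = rotl32(b, 7);    // ushr #25 + sli #7
  }

  int main() {
    uint32_t a = 0x11111111, b = 0x01020304, c = 0x9b8d6f43, d = 0x01234567;
    quarter_round(a, b, c, d);
    // Expect: ea2a92f4 cb1cf8ce 4581472e 5881c4bb
    std::printf("%08x %08x %08x %08x\n", a, b, c, d);
    return 0;
  }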
130 changes: 130 additions & 0 deletions src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
@@ -4081,6 +4081,132 @@ class StubGenerator: public StubCodeGenerator {
return start;
}

// ChaCha20 block function. This version parallelizes by loading
// individual 32-bit state elements into vectors for four blocks
// (e.g. all four blocks' worth of state[0] in one register, etc.)
//
// state (int[16]) = c_rarg0
// keystream (byte[1024]) = c_rarg1
// return - number of bytes of keystream (always 256)
address generate_chacha20Block_blockpar() {
Label L_twoRounds, L_cc20_const;
// The constant data is broken into two 128-bit segments to be loaded
// onto FloatRegisters. The first 128 bits are a counter add overlay
// that adds +0/+1/+2/+3 to the vector holding replicated state[12].
// The second 128 bits are a table constant used for 8-bit left rotations.
__ BIND(L_cc20_const);
__ emit_int64(0x0000000100000000UL);
__ emit_int64(0x0000000300000002UL);
__ emit_int64(0x0605040702010003UL);
__ emit_int64(0x0E0D0C0F0A09080BUL);

__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "chacha20Block");
address start = __ pc();
__ enter();

int i, j;
const Register state = c_rarg0;
const Register keystream = c_rarg1;
const Register loopCtr = r10;
const Register tmpAddr = r11;

const FloatRegister stateFirst = v0;
const FloatRegister stateSecond = v1;
const FloatRegister stateThird = v2;
const FloatRegister stateFourth = v3;
const FloatRegister origCtrState = v28;
const FloatRegister scratch = v29;
const FloatRegister lrot8Tbl = v30;

// Organize SIMD registers in an array that facilitates
// putting repetitive opcodes into loop structures. It is
// important that each grouping of 4 registers is monotonically
// increasing to support the requirements of multi-register
// instructions (e.g. ld4r, st4, etc.)
const FloatRegister workSt[16] = {
v4, v5, v6, v7, v16, v17, v18, v19,
v20, v21, v22, v23, v24, v25, v26, v27
};

// Load from memory and interlace across 16 SIMD registers,
// with each word from memory being broadcast to all lanes of
// each successive SIMD register.
// Addr(0) -> All lanes in workSt[i]
// Addr(4) -> All lanes in workSt[i + 1], etc.
__ mov(tmpAddr, state);
for (i = 0; i < 16; i += 4) {
__ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
__ post(tmpAddr, 16));
}

// Pull in constant data. The first 16 bytes are the add overlay
// which is applied to the vector holding the counter (state[12]).
// The second 16 bytes are the index table for the 8-bit left
// rotation tbl instruction.
__ adr(tmpAddr, L_cc20_const);
__ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
__ addv(workSt[12], __ T4S, workSt[12], origCtrState);

// Set up the 10 iteration loop and perform all 8 quarter round ops
__ mov(loopCtr, 10);
__ BIND(L_twoRounds);

__ cc20_quarter_round(workSt[0], workSt[4], workSt[8], workSt[12],
scratch, lrot8Tbl);
__ cc20_quarter_round(workSt[1], workSt[5], workSt[9], workSt[13],
scratch, lrot8Tbl);
__ cc20_quarter_round(workSt[2], workSt[6], workSt[10], workSt[14],
scratch, lrot8Tbl);
__ cc20_quarter_round(workSt[3], workSt[7], workSt[11], workSt[15],
scratch, lrot8Tbl);

__ cc20_quarter_round(workSt[0], workSt[5], workSt[10], workSt[15],
scratch, lrot8Tbl);
__ cc20_quarter_round(workSt[1], workSt[6], workSt[11], workSt[12],
scratch, lrot8Tbl);
__ cc20_quarter_round(workSt[2], workSt[7], workSt[8], workSt[13],
scratch, lrot8Tbl);
__ cc20_quarter_round(workSt[3], workSt[4], workSt[9], workSt[14],
scratch, lrot8Tbl);

// Decrement and iterate
__ sub(loopCtr, loopCtr, 1);
__ cbnz(loopCtr, L_twoRounds);

__ mov(tmpAddr, state);

// Add the starting state back to the post-loop keystream
// state. We read/interlace the state array from memory into
// 4 registers, similarly to what we did at the beginning. Then
// add the counter overlay onto workSt[12] at the end.
for (i = 0; i < 16; i += 4) {
__ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
__ post(tmpAddr, 16));
__ addv(workSt[i], __ T4S, workSt[i], stateFirst);
__ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
__ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
__ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
}
__ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask

// Write to key stream, storing the same element out of workSt[0..15]
// to consecutive 4-byte offsets in the key stream buffer, then repeating
// for the next element position.
for (i = 0; i < 4; i++) {
for (j = 0; j < 16; j += 4) {
__ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
__ post(keystream, 16));
}
}

__ mov(r0, 256); // Return length of output keystream
__ leave();
__ ret(lr);

return start;
}

/**
* Arguments:
*
@@ -7919,6 +8045,10 @@ class StubGenerator: public StubCodeGenerator {
}
#endif // COMPILER2

if (UseChaCha20Intrinsics) {
StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
}

if (UseBASE64Intrinsics) {
StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
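
To make the stub's contract concrete, here is a scalar C++ model of what
generate_chacha20Block_blockpar produces: four consecutive 64-byte ChaCha20
blocks (counters state[12]+0 through state[12]+3) written block-by-block into
a 256-byte keystream. This is an illustrative sketch, not the stub itself,
and it assumes a little-endian host such as AArch64.

  #include <cstdint>
  #include <cstring>

  static inline uint32_t rotl32(uint32_t x, int n) {
    return (x << n) | (x >> (32 - n));
  }

  static void quarter(uint32_t st[16], int a, int b, int c, int d) {
    st[a] += st[b]; st[d] ^= st[a]; st[d] = rotl32(st[d], 16);
    st[c] += st[d]; st[b] ^= st[c]; st[b] = rotl32(st[b], 12);
    st[a] += st[b]; st[d] ^= st[a]; st[d] = rotl32(st[d], 8);
    st[c] += st[d]; st[b] ^= st[c]; st[b] = rotl32(st[b], 7);
  }

  // state: constants, key, counter and nonce as 16 words; returns the
  // keystream length, matching the stub's return value in r0.
  static int chacha20_block_x4(const uint32_t state[16], uint8_t keystream[256]) {
    for (uint32_t blk = 0; blk < 4; blk++) {
      uint32_t ws[16];
      std::memcpy(ws, state, sizeof(ws));
      ws[12] += blk;                             // the +0/+1/+2/+3 counter overlay
      for (int round = 0; round < 10; round++) { // 10 iterations of two rounds
        quarter(ws, 0, 4,  8, 12);               // columnar quarter rounds
        quarter(ws, 1, 5,  9, 13);
        quarter(ws, 2, 6, 10, 14);
        quarter(ws, 3, 7, 11, 15);
        quarter(ws, 0, 5, 10, 15);               // diagonal quarter rounds
        quarter(ws, 1, 6, 11, 12);
        quarter(ws, 2, 7,  8, 13);
        quarter(ws, 3, 4,  9, 14);
      }
      for (int i = 0; i < 16; i++) {
        // Add the starting state back in (plus the counter overlay for word 12)
        uint32_t w = ws[i] + state[i] + (i == 12 ? blk : 0);
        std::memcpy(keystream + 64 * blk + 4 * i, &w, 4); // little-endian store
      }
    }
    return 256;
  }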
11 changes: 11 additions & 0 deletions src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
@@ -366,6 +366,17 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}

if (_features & CPU_ASIMD) {
if (FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
UseChaCha20Intrinsics = true;
}
} else if (UseChaCha20Intrinsics) {
if (!FLAG_IS_DEFAULT(UseChaCha20Intrinsics)) {
warning("ChaCha20 intrinsic requires ASIMD instructions");
}
FLAG_SET_DEFAULT(UseChaCha20Intrinsics, false);
}

if (FLAG_IS_DEFAULT(UseBASE64Intrinsics)) {
UseBASE64Intrinsics = true;
}
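
With this gating, any AArch64 CPU that reports ASIMD support gets the
intrinsic by default; since UseChaCha20Intrinsics is an ordinary boolean VM
flag, it can still be disabled explicitly with -XX:-UseChaCha20Intrinsics.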
