riscv: GCM: Implement GHASH()
RISC-V currently offers only a GMULT() callback for accelerated
processing. Let's implement the missing piece so that GHASH() is
available as well. As with GMULT(), we provide a variant for
systems with the Zbkb extension (which includes brev8).

The integration follows the existing pattern for GMULT()
on RISC-V. We keep the C implementation because the decision
whether an optimized routine can be called is made at run-time.
The C implementation remains the fallback in case no extensions
that could accelerate the calculation are available.
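
For reference, the run-time selection reduces to the following
condensed sketch (ginit handling elided; it mirrors the
gcm_get_funcs() change in crypto/modes/gcm128.c below):

    /* Portable C fallback, overridden if extensions are present. */
    ctx->gmult = gcm_gmult_4bit;
    ctx->ghash = gcm_ghash_4bit;
    if (RISCV_HAS_ZBC()) {
        if (RISCV_HAS_ZBKB()) {
            ctx->gmult = gcm_gmult_rv64i_zbc__zbkb;
            ctx->ghash = gcm_ghash_rv64i_zbc__zbkb;
        } else {
            ctx->gmult = gcm_gmult_rv64i_zbc;
            ctx->ghash = gcm_ghash_rv64i_zbc;
        }
    }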

Tested with all combinations of possible extensions
on QEMU (limiting the available instructions accordingly).
No regressions observed.

Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu>

Reviewed-by: Tomas Mraz <tomas@openssl.org>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from #20078)
cmuellner authored and paulidale committed Mar 16, 2023
1 parent b246843 commit f3fed0d
Showing 2 changed files with 170 additions and 8 deletions.
153 changes: 153 additions & 0 deletions crypto/modes/asm/ghash-riscv64.pl
@@ -229,6 +229,159 @@
___
}

################################################################################
# void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16],
# const u8 *inp, size_t len);
# void gcm_ghash_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16],
# const u8 *inp, size_t len);
#
# input: Xi: current hash value
# Htable: copy of H
# inp: pointer to input data
# len: length of input data in bytes (multiple of block size)
# output: Xi: next hash value (Xi+1)
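#
# Per 16-byte block this computes the GHASH recurrence
# Xi = (Xi ^ inp_block) * H in GF(2^128), the field being
# defined by GCM's reduction polynomial
# x^128 + x^7 + x^2 + x + 1.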
{
my ($Xi,$Htable,$inp,$len,$x0,$x1,$y0,$y1) = ("a0","a1","a2","a3","a4","a5","a6","a7");
my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");

$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zbc
.type gcm_ghash_rv64i_zbc,\@function
gcm_ghash_rv64i_zbc:
# Load Xi and bit-reverse it
ld $x0, 0($Xi)
ld $x1, 8($Xi)
@{[brev8_rv64i $x0, $z0, $z1, $z2]}
@{[brev8_rv64i $x1, $z0, $z1, $z2]}
# Load the key (already bit-reversed)
ld $y0, 0($Htable)
ld $y1, 8($Htable)
# Load the reduction constant
la $polymod, Lpolymod
lbu $polymod, 0($polymod)
Lstep:
# Load the input data, bit-reverse them, and XOR them with Xi
ld $t0, 0($inp)
ld $t1, 8($inp)
add $inp, $inp, 16
add $len, $len, -16
@{[brev8_rv64i $t0, $z0, $z1, $z2]}
@{[brev8_rv64i $t1, $z0, $z1, $z2]}
xor $x0, $x0, $t0
xor $x1, $x1, $t1
# Multiplication (without Karatsuba)
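# The four clmul/clmulh pairs below compute the full 256-bit
# carry-less product of {x1,x0} and {y1,y0}; the partial
# products are accumulated into z3:z2:z1:z0 with xor.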
@{[clmulh $z3, $x1, $y1]}
@{[clmul $z2, $x1, $y1]}
@{[clmulh $t1, $x0, $y1]}
@{[clmul $z1, $x0, $y1]}
xor $z2, $z2, $t1
@{[clmulh $t1, $x1, $y0]}
@{[clmul $t0, $x1, $y0]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $x0, $y0]}
@{[clmul $z0, $x0, $y0]}
xor $z1, $z1, $t1
# Reduction with clmul
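# Fold the upper 128 bits (z3:z2) back into the lower half by
# carry-less multiplying each high word with the reduction
# constant and xor-ing it in; after two folds the 128-bit
# remainder is left in x1:x0.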
@{[clmulh $t1, $z3, $polymod]}
@{[clmul $t0, $z3, $polymod]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $z2, $polymod]}
@{[clmul $t0, $z2, $polymod]}
xor $x1, $z1, $t1
xor $x0, $z0, $t0
# Iterate over all blocks
bnez $len, Lstep
# Bit-reverse final Xi back and store it
@{[brev8_rv64i $x0, $z0, $z1, $z2]}
@{[brev8_rv64i $x1, $z0, $z1, $z2]}
sd $x0, 0($Xi)
sd $x1, 8($Xi)
ret
.size gcm_ghash_rv64i_zbc,.-gcm_ghash_rv64i_zbc
___
}

{
my ($Xi,$Htable,$inp,$len,$x0,$x1,$y0,$y1) = ("a0","a1","a2","a3","a4","a5","a6","a7");
my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");

$code .= <<___;
.p2align 3
.globl gcm_ghash_rv64i_zbc__zbkb
.type gcm_ghash_rv64i_zbc__zbkb,\@function
gcm_ghash_rv64i_zbc__zbkb:
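# Same algorithm as gcm_ghash_rv64i_zbc above; the only
# difference is that bit reversal uses the single Zbkb brev8
# instruction instead of the brev8_rv64i instruction sequence.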
# Load Xi and bit-reverse it
ld $x0, 0($Xi)
ld $x1, 8($Xi)
@{[brev8 $x0, $x0]}
@{[brev8 $x1, $x1]}
# Load the key (already bit-reversed)
ld $y0, 0($Htable)
ld $y1, 8($Htable)
# Load the reduction constant
la $polymod, Lpolymod
lbu $polymod, 0($polymod)
Lstep_zbkb:
# Load the input data, bit-reverse them, and XOR them with Xi
ld $t0, 0($inp)
ld $t1, 8($inp)
add $inp, $inp, 16
add $len, $len, -16
@{[brev8 $t0, $t0]}
@{[brev8 $t1, $t1]}
xor $x0, $x0, $t0
xor $x1, $x1, $t1
# Multiplication (without Karatsuba)
@{[clmulh $z3, $x1, $y1]}
@{[clmul $z2, $x1, $y1]}
@{[clmulh $t1, $x0, $y1]}
@{[clmul $z1, $x0, $y1]}
xor $z2, $z2, $t1
@{[clmulh $t1, $x1, $y0]}
@{[clmul $t0, $x1, $y0]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $x0, $y0]}
@{[clmul $z0, $x0, $y0]}
xor $z1, $z1, $t1
# Reduction with clmul
@{[clmulh $t1, $z3, $polymod]}
@{[clmul $t0, $z3, $polymod]}
xor $z2, $z2, $t1
xor $z1, $z1, $t0
@{[clmulh $t1, $z2, $polymod]}
@{[clmul $t0, $z2, $polymod]}
xor $x1, $z1, $t1
xor $x0, $z0, $t0
# Iterate over all blocks
bnez $len, Lstep_zbkb
# Bit-reverse final Xi back and store it
@{[brev8 $x0, $x0]}
@{[brev8 $x1, $x1]}
sd $x0, 0($Xi)
sd $x1, 8($Xi)
ret
.size gcm_ghash_rv64i_zbc__zbkb,.-gcm_ghash_rv64i_zbc__zbkb
___
}

$code .= <<___;
.p2align 3
Lbrev8_const:
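The commit message mentions testing all extension combinations under
QEMU. As a rough illustration, here is a minimal, hypothetical
cross-check harness (not part of the commit) that compares the C
fallback against the new Zbc routine. It assumes the internal symbols
are linkable (gcm_init_4bit() and gcm_ghash_4bit() are static in
gcm128.c, so they would need to be exported for this experiment) and
that the u8/u64/u128 typedefs match the internal ones:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    typedef uint8_t u8;
    typedef uint64_t u64;
    typedef struct { u64 hi, lo; } u128;

    void gcm_init_4bit(u128 Htable[16], const u64 H[2]);
    void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
                        const u8 *inp, size_t len);
    void gcm_init_rv64i_zbc(u128 Htable[16], const u64 H[2]);
    void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16],
                             const u8 *inp, size_t len);

    int main(void)
    {
        u64 H[2] = { 0x0123456789abcdefULL, 0xfedcba9876543210ULL };
        u8 inp[32] = { 1, 2, 3 };          /* two 16-byte blocks */
        u128 ht_c[16], ht_zbc[16];
        u64 xi_c[2] = { 0, 0 }, xi_zbc[2] = { 0, 0 };

        /* Build both key tables from the same H, then hash the
         * same input with both implementations and compare Xi. */
        gcm_init_4bit(ht_c, H);
        gcm_init_rv64i_zbc(ht_zbc, H);
        gcm_ghash_4bit(xi_c, ht_c, inp, sizeof(inp));
        gcm_ghash_rv64i_zbc(xi_zbc, ht_zbc, inp, sizeof(inp));

        puts(memcmp(xi_c, xi_zbc, sizeof(xi_c)) == 0 ? "match"
                                                     : "MISMATCH");
        return 0;
    }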
25 changes: 17 additions & 8 deletions crypto/modes/gcm128.c
@@ -27,9 +27,10 @@ typedef size_t size_t_aX;
 # define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
 #endif
 
-/* RISC-V uses C implementation of gmult as a fallback. */
+/* RISC-V uses C implementation as a fallback. */
 #if defined(__riscv)
 # define INCLUDE_C_GMULT_4BIT
+# define INCLUDE_C_GHASH_4BIT
 #endif
 
 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
@@ -232,7 +233,7 @@ static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
 
 # endif
 
-# if !defined(GHASH_ASM)
+# if !defined(GHASH_ASM) || defined(INCLUDE_C_GHASH_4BIT)
 #  if !defined(OPENSSL_SMALL_FOOTPRINT)
 /*
  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
@@ -401,22 +402,25 @@ void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                   size_t len);
 # elif defined(OPENSSL_CPUID_OBJ) && defined(__riscv) && __riscv_xlen == 64
 #  include "crypto/riscv_arch.h"
-#  define GHASH_ASM_RISCV
-#  undef GHASH
+#  define GHASH_ASM_RV64I
 /* Zbc/Zbkc (scalar crypto with clmul) based routines. */
 void gcm_init_rv64i_zbc(u128 Htable[16], const u64 Xi[2]);
 void gcm_init_rv64i_zbc__zbb(u128 Htable[16], const u64 Xi[2]);
 void gcm_init_rv64i_zbc__zbkb(u128 Htable[16], const u64 Xi[2]);
 void gcm_gmult_rv64i_zbc(u64 Xi[2], const u128 Htable[16]);
 void gcm_gmult_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16]);
+void gcm_ghash_rv64i_zbc(u64 Xi[2], const u128 Htable[16],
+                         const u8 *inp, size_t len);
+void gcm_ghash_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16],
+                               const u8 *inp, size_t len);
 # endif
 #endif
 
 static void gcm_get_funcs(struct gcm_funcs_st *ctx)
 {
     /* set defaults -- overridden below as needed */
     ctx->ginit = gcm_init_4bit;
-#if !defined(GHASH_ASM) || defined(INCLUDE_C_GMULT_4BIT)
+#if !defined(GHASH_ASM)
     ctx->gmult = gcm_gmult_4bit;
 #else
     ctx->gmult = NULL;
@@ -503,19 +507,24 @@ static void gcm_get_funcs(struct gcm_funcs_st *ctx)
         ctx->ghash = gcm_ghash_p8;
     }
     return;
-#elif defined(GHASH_ASM_RISCV) && __riscv_xlen == 64
-    /* RISCV defaults; gmult already set above */
-    ctx->ghash = NULL;
+#elif defined(GHASH_ASM_RV64I)
+    /* RISCV defaults */
+    ctx->gmult = gcm_gmult_4bit;
+    ctx->ghash = gcm_ghash_4bit;
+
     if (RISCV_HAS_ZBC()) {
         if (RISCV_HAS_ZBKB()) {
             ctx->ginit = gcm_init_rv64i_zbc__zbkb;
             ctx->gmult = gcm_gmult_rv64i_zbc__zbkb;
+            ctx->ghash = gcm_ghash_rv64i_zbc__zbkb;
         } else if (RISCV_HAS_ZBB()) {
             ctx->ginit = gcm_init_rv64i_zbc__zbb;
             ctx->gmult = gcm_gmult_rv64i_zbc;
+            ctx->ghash = gcm_ghash_rv64i_zbc;
         } else {
             ctx->ginit = gcm_init_rv64i_zbc;
             ctx->gmult = gcm_gmult_rv64i_zbc;
+            ctx->ghash = gcm_ghash_rv64i_zbc;
         }
     }
     return;
