Skip to content

Commit

Permalink
ec: powerpc64le: Add asm implementation of felem_{square,mul}
Browse files Browse the repository at this point in the history
Add an assembly implementation of felem_{square,mul}, which will be
used whenever Altivec support is present and the core implements
ISA 3.0 (Power 9) or greater.

Signed-off-by: Rohan McLure <rohanmclure@linux.ibm.com>

Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Shane Lontis <shane.lontis@oracle.com>
Reviewed-by: Dmitry Belyavskiy <beldmit@gmail.com>
Reviewed-by: Todd Short <todd.short@me.com>
(Merged from #21471)
  • Loading branch information
Rohan McLure authored and tmshort committed Aug 4, 2023
1 parent 01d901e commit 966047e
Show file tree
Hide file tree
Showing 3 changed files with 368 additions and 2 deletions.
355 changes: 355 additions & 0 deletions crypto/ec/asm/ecp_nistp384-ppc64.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,355 @@
#! /usr/bin/env perl
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Rohan McLure <rmclure@linux.ibm.com> for the OpenSSL
# project.
# ====================================================================
#
# p384 lower-level primitives for PPC64 using vector instructions.
#

use strict;
use warnings;

# Command line: the first argument is the flavour handed to ppc-xlate.pl.
# The output name is the first remaining argument that looks like a file
# name with an extension; if there is none, "-" is passed on instead.
my $flavour = shift;
my $output = "";
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
if (!$output) {
    $output = "-";
}

# Locate ppc-xlate.pl next to this script or in ../../perlasm relative to
# it.  When $0 carries no directory component the prefix is the empty
# string, so an undefined value is never interpolated into the path.
my $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
my $xlate;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through ppc-xlate.pl.
# The open is checked: a failed spawn must abort the build instead of
# silently producing no assembly.  The output name is quoted so that paths
# containing blanks survive the shell.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Accumulates the generated assembly; printed once at the end of the script.
my $code = "";

# General purpose registers used by the generated code: stack pointer,
# output pointer, and two scratch registers used by push_vrs/pop_vrs.
my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12");

# Vector register that the generated procedures clear and use as the zero
# addend of vmsumudm.
my $vzero = "v32";

# Append the opening of a global procedure to $code: the .globl directive,
# a ".align 5" directive and the entry label for the given name.
sub startproc($)
{
    my $label = shift;

    $code .= join("\n",
                  ".globl ${label}",
                  ".align 5",
                  "${label}:",
                  "");
}

# Append the end of a procedure to $code: the returning blr followed by the
# .size directive for the given name.
sub endproc($)
{
    my $label = shift;

    $code .= "blr\n";
    $code .= ".size ${label},.-${label}\n";
}


# Append code that saves the registers numbered $min..$max on the stack.
# The old stack pointer is copied to $savesp, the stack pointer is moved
# down by 16 bytes per register plus one extra 16-byte slot with stdu, and
# each register is stored with stxv at a negative offset from $savesp
# (register $max lands closest to the old stack pointer).
# The backtick expression is evaluated by the substitution that runs over
# $code at the end of the script.
sub push_vrs($$)
{
    my ($first, $last) = @_;
    my $count = $last - $first + 1;

    $code .= "mr $savesp,$sp\n";
    $code .= "stdu $sp,-16*`$count+1`($sp)\n";

    for my $reg ($first .. $last) {
        my $slot = $last - $reg + 1;
        $code .= "stxv $reg,-16*$slot($savesp)\n";
    }
}

# Append code that undoes push_vrs for the registers numbered $min..$max:
# the previous stack pointer is loaded from 0($sp) into $savesp, every
# register is reloaded with lxv from the slot push_vrs stored it in, and
# finally the stack pointer is set back to $savesp.
sub pop_vrs($$)
{
    my ($first, $last) = @_;

    $code .= "ld $savesp,0($sp)\n";

    for my $reg ($first .. $last) {
        my $slot = $last - $reg + 1;
        $code .= "lxv $reg,-16*$slot($savesp)\n";
    }

    $code .= "mr $sp,$savesp\n";
}

# Append seven lxsd loads to $code: element k of the register list
# (k = 0..6) is loaded from byte offset 8*k relative to $pointer.
#   $pointer  - name of the register holding the base address
#   $reg_list - reference to an array of at least seven register names
sub load_vrs($$)
{
    my ($pointer, $reg_list) = @_;

    for my $idx (0 .. 6) {
        my $reg = $reg_list->[$idx];
        my $offset = 8 * $idx;
        $code .= "lxsd $reg,$offset($pointer)\n";
    }
}

# Append thirteen stxv stores to $code: element k of the register list
# (k = 0..12) is stored at byte offset 16*k relative to $pointer.
#   $pointer  - name of the register holding the base address
#   $reg_list - reference to an array of at least thirteen register names
sub store_vrs($$)
{
    my ($pointer, $reg_list) = @_;

    for my $idx (0 .. 12) {
        my $reg = $reg_list->[$idx];
        my $offset = 16 * $idx;
        $code .= "stxv $reg,$offset($pointer)\n";
    }
}

# Assembler preamble: the .machine and .text directives that precede the
# generated procedures.
$code .= qq{.machine "any"\n};
$code .= qq{.text\n};

{
# mul/square common
my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v42", "v43");
my ($zero, $one) = ("r8", "r9");
my $out = "v51";

{
#
# p384_felem_mul
#
# Generates p384_felem_mul.  r4 and r5 point at the two inputs, each read
# as seven 64-bit limbs at an 8-byte stride (see load_vrs); r3 ($outp)
# points at the output, which receives thirteen 128-bit values at a
# 16-byte stride.  Output k is the sum of in1[i]*in2[j] over i+j == k.
#
# xxpermdi with selector 0b00 packs the loaded limbs of two registers into
# one register, so that a single vmsumudm accumulates two limb products;
# an odd term left over is added with a vmsumudm on the unpacked registers.
#
# No vector registers are saved or restored.  The code only touches
# v32..v51 ($vzero, $t1..$t4, @in1, @in2 and $out); these names denote
# vr0..vr19, which the ELFv2 ABI treats as volatile.  In particular
# push_vrs() must not be called here: it moves the stack pointer with
# stdu, and without a matching pop_vrs() before the blr emitted by
# endproc() the procedure would return with a corrupted stack pointer.
#

    my ($in1p, $in2p) = ("r4", "r5");
    my @in1 = map("v$_",(44..50));
    my @in2 = map("v$_",(35..41));

    startproc("p384_felem_mul");

    # Zero addend for the first vmsumudm of every output limb.
    $code.=<<___;
vspltisw $vzero,0
___

    load_vrs($in1p, \@in1);
    load_vrs($in2p, \@in2);

    # Output limbs 0..12 in order; each group ends with the stxv that
    # stores the finished limb.
    $code.=<<___;
vmsumudm $out,$in1[0],$in2[0],$vzero
stxv $out,0($outp)
xxpermdi $t1,$in1[0],$in1[1],0b00
xxpermdi $t2,$in2[1],$in2[0],0b00
vmsumudm $out,$t1,$t2,$vzero
stxv $out,16($outp)
xxpermdi $t2,$in2[2],$in2[1],0b00
vmsumudm $out,$t1,$t2,$vzero
vmsumudm $out,$in1[2],$in2[0],$out
stxv $out,32($outp)
xxpermdi $t2,$in2[1],$in2[0],0b00
xxpermdi $t3,$in1[2],$in1[3],0b00
xxpermdi $t4,$in2[3],$in2[2],0b00
vmsumudm $out,$t1,$t4,$vzero
vmsumudm $out,$t3,$t2,$out
stxv $out,48($outp)
xxpermdi $t2,$in2[4],$in2[3],0b00
xxpermdi $t4,$in2[2],$in2[1],0b00
vmsumudm $out,$t1,$t2,$vzero
vmsumudm $out,$t3,$t4,$out
vmsumudm $out,$in1[4],$in2[0],$out
stxv $out,64($outp)
xxpermdi $t2,$in2[5],$in2[4],0b00
xxpermdi $t4,$in2[3],$in2[2],0b00
vmsumudm $out,$t1,$t2,$vzero
vmsumudm $out,$t3,$t4,$out
xxpermdi $t4,$in2[1],$in2[0],0b00
xxpermdi $t1,$in1[4],$in1[5],0b00
vmsumudm $out,$t1,$t4,$out
stxv $out,80($outp)
xxpermdi $t1,$in1[0],$in1[1],0b00
xxpermdi $t2,$in2[6],$in2[5],0b00
xxpermdi $t4,$in2[4],$in2[3],0b00
vmsumudm $out,$t1,$t2,$vzero
vmsumudm $out,$t3,$t4,$out
xxpermdi $t2,$in2[2],$in2[1],0b00
xxpermdi $t1,$in1[4],$in1[5],0b00
vmsumudm $out,$t1,$t2,$out
vmsumudm $out,$in1[6],$in2[0],$out
stxv $out,96($outp)
xxpermdi $t1,$in1[1],$in1[2],0b00
xxpermdi $t2,$in2[6],$in2[5],0b00
xxpermdi $t3,$in1[3],$in1[4],0b00
vmsumudm $out,$t1,$t2,$vzero
vmsumudm $out,$t3,$t4,$out
xxpermdi $t3,$in2[2],$in2[1],0b00
xxpermdi $t1,$in1[5],$in1[6],0b00
vmsumudm $out,$t1,$t3,$out
stxv $out,112($outp)
xxpermdi $t1,$in1[2],$in1[3],0b00
xxpermdi $t3,$in1[4],$in1[5],0b00
vmsumudm $out,$t1,$t2,$vzero
vmsumudm $out,$t3,$t4,$out
vmsumudm $out,$in1[6],$in2[2],$out
stxv $out,128($outp)
xxpermdi $t1,$in1[3],$in1[4],0b00
vmsumudm $out,$t1,$t2,$vzero
xxpermdi $t1,$in1[5],$in1[6],0b00
vmsumudm $out,$t1,$t4,$out
stxv $out,144($outp)
vmsumudm $out,$t3,$t2,$vzero
vmsumudm $out,$in1[6],$in2[4],$out
stxv $out,160($outp)
vmsumudm $out,$t1,$t2,$vzero
stxv $out,176($outp)
vmsumudm $out,$in1[6],$in2[6],$vzero
stxv $out,192($outp)
___

    endproc("p384_felem_mul");
}

{
#
# p384_felem_square
#
# Generates p384_felem_square.  r4 points at the input, read as seven
# 64-bit limbs at an 8-byte stride (see load_vrs); r3 ($outp) points at
# the output, which receives thirteen 128-bit values at a 16-byte stride.
# Output k is the sum of in[i]*in[j] over i+j == k.
#
# @inx2 holds every input limb shifted left by one bit, so each cross
# product in[i]*in[j] with i != j is formed once as in[i]*inx2[j] instead
# of being added twice.  xxpermdi with selector 0b00 packs the loaded
# limbs of two registers into one register so that a single vmsumudm
# accumulates two limb products.
#
# No vector registers are saved or restored.  The code only touches
# v32..v51 ($vzero, $t1..$t4, @in, @inx2 and $out); these names denote
# vr0..vr19, which the ELFv2 ABI treats as volatile.  In particular
# push_vrs() must not be called here: it moves the stack pointer with
# stdu, and without a matching pop_vrs() before the blr emitted by
# endproc() the procedure would return with a corrupted stack pointer.
#

    my ($inp) = ("r4");
    my @in = map("v$_",(44..50));
    my @inx2 = map("v$_",(35..41));

    startproc("p384_felem_square");

    # Zero addend for the first vmsumudm of every output limb.
    $code.=<<___;
vspltisw $vzero,0
___

    load_vrs($inp, \@in);

    # Build the per-doubleword shift counts for vsld in $t1: one for the
    # doubleword that holds the loaded limb, zero for the other.
    $code.=<<___;
li $zero,0
li $one,1
mtvsrdd $t1,$one,$zero
___

    # inx2[i] = in[i] << 1
    for my $i (0 .. 6) {
        $code.=<<___;
vsld $inx2[$i],$in[$i],$t1
___
    }

    # Output limbs 0..12 in order; each group ends with the stxv that
    # stores the finished limb.
    $code.=<<___;
vmsumudm $out,$in[0],$in[0],$vzero
stxv $out,0($outp)
vmsumudm $out,$in[0],$inx2[1],$vzero
stxv $out,16($outp)
vmsumudm $out,$in[0],$inx2[2],$vzero
vmsumudm $out,$in[1],$in[1],$out
stxv $out,32($outp)
xxpermdi $t1,$in[0],$in[1],0b00
xxpermdi $t2,$inx2[3],$inx2[2],0b00
vmsumudm $out,$t1,$t2,$vzero
stxv $out,48($outp)
xxpermdi $t4,$inx2[4],$inx2[3],0b00
vmsumudm $out,$t1,$t4,$vzero
vmsumudm $out,$in[2],$in[2],$out
stxv $out,64($outp)
xxpermdi $t2,$inx2[5],$inx2[4],0b00
vmsumudm $out,$t1,$t2,$vzero
vmsumudm $out,$in[2],$inx2[3],$out
stxv $out,80($outp)
xxpermdi $t2,$inx2[6],$inx2[5],0b00
vmsumudm $out,$t1,$t2,$vzero
vmsumudm $out,$in[2],$inx2[4],$out
vmsumudm $out,$in[3],$in[3],$out
stxv $out,96($outp)
xxpermdi $t3,$in[1],$in[2],0b00
vmsumudm $out,$t3,$t2,$vzero
vmsumudm $out,$in[3],$inx2[4],$out
stxv $out,112($outp)
xxpermdi $t1,$in[2],$in[3],0b00
vmsumudm $out,$t1,$t2,$vzero
vmsumudm $out,$in[4],$in[4],$out
stxv $out,128($outp)
xxpermdi $t1,$in[3],$in[4],0b00
vmsumudm $out,$t1,$t2,$vzero
stxv $out,144($outp)
vmsumudm $out,$in[4],$inx2[6],$vzero
vmsumudm $out,$in[5],$in[5],$out
stxv $out,160($outp)
vmsumudm $out,$in[5],$inx2[6],$vzero
stxv $out,176($outp)
vmsumudm $out,$in[6],$in[6],$vzero
stxv $out,192($outp)
___

    endproc("p384_felem_square");
}
}

# Replace every backtick-delimited span in the generated text with the
# result of evaluating the enclosed Perl expression, then write the text to
# STDOUT and make sure the stream closes cleanly.
$code =~ s{`([^`]*)`}{eval $1}gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
6 changes: 4 additions & 2 deletions crypto/ec/build.info
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ IF[{- !$disabled{asm} -}]
$ECASM_ppc64=ecp_nistz256.c ecp_ppc.c ecp_nistz256-ppc64.s
$ECDEF_ppc64=ECP_NISTZ256_ASM
IF[{- !$disabled{'ec_nistp_64_gcc_128'} -}]
$ECASM_ppc64=$ECASM_ppc64 ecp_nistp521-ppc64.s
$ECDEF_ppc64=$ECDEF_ppc64 ECP_NISTP521_ASM
$ECASM_ppc64=$ECASM_ppc64 ecp_nistp384-ppc64.s ecp_nistp521-ppc64.s
$ECDEF_ppc64=$ECDEF_ppc64 ECP_NISTP384_ASM ECP_NISTP521_ASM
INCLUDE[ecp_nistp384.o]=..
INCLUDE[ecp_nistp521.o]=..
ENDIF
IF[{- !$disabled{'ecx'} -}]
Expand Down Expand Up @@ -119,6 +120,7 @@ GENERATE[ecp_nistz256-armv8.S]=asm/ecp_nistz256-armv8.pl
INCLUDE[ecp_nistz256-armv8.o]=..
GENERATE[ecp_nistz256-ppc64.s]=asm/ecp_nistz256-ppc64.pl

GENERATE[ecp_nistp384-ppc64.s]=asm/ecp_nistp384-ppc64.pl
GENERATE[ecp_nistp521-ppc64.s]=asm/ecp_nistp521-ppc64.pl

IF[{- !$disabled{'ecx'} -}]
Expand Down
9 changes: 9 additions & 0 deletions crypto/ec/ecp_nistp384.c
Original file line number Diff line number Diff line change
Expand Up @@ -691,6 +691,15 @@ void p384_felem_mul(widefelem out, const felem in1, const felem in2);

static void felem_select(void)
{
# if defined(_ARCH_PPC64)
if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
felem_square_p = p384_felem_square;
felem_mul_p = p384_felem_mul;

return;
}
# endif

/* Default */
felem_square_p = felem_square_ref;
felem_mul_p = felem_mul_ref;
Expand Down

0 comments on commit 966047e

Please sign in to comment.