Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
8269725: AArch64: Add VectorMask query implementation for NEON
Reviewed-by: aph
  • Loading branch information
Xiaohong Gong authored and Ningsheng Jian committed Jul 16, 2021
1 parent 7240d67 commit ea77ef8
Show file tree
Hide file tree
Showing 8 changed files with 427 additions and 101 deletions.
26 changes: 25 additions & 1 deletion src/hotspot/cpu/aarch64/aarch64.ad
Expand Up @@ -1295,7 +1295,31 @@ public:
};
};

bool is_CAS(int opcode, bool maybe_volatile);
// Element type of the vector value described by the bottom type of node "n".
static inline BasicType vector_element_basic_type(const MachNode* n) {
  return n->bottom_type()->is_vect()->element_basic_type();
}

// Element type of the vector value feeding operand "opnd" of node "use".
// The operand is mapped to an input edge of "use" through operand_index(),
// and the bottom type of that input node is queried.
static inline BasicType vector_element_basic_type(const MachNode* use, const MachOper* opnd) {
  Node* input = use->in(use->operand_index(opnd));
  return input->bottom_type()->is_vect()->element_basic_type();
}

// Lane count of the vector value described by the bottom type of node "n".
static inline uint vector_length(const MachNode* n) {
  return n->bottom_type()->is_vect()->length();
}

// Lane count of the vector value feeding operand "opnd" of node "use".
// The operand is mapped to an input edge of "use" through operand_index(),
// and the bottom type of that input node is queried.
static inline uint vector_length(const MachNode* use, const MachOper* opnd) {
  Node* input = use->in(use->operand_index(opnd));
  return input->bottom_type()->is_vect()->length();
}

bool is_CAS(int opcode, bool maybe_volatile);

// predicates controlling emit of ldr<x>/ldar<x> and associated dmb

Expand Down
169 changes: 169 additions & 0 deletions src/hotspot/cpu/aarch64/aarch64_neon.ad
Expand Up @@ -5296,3 +5296,172 @@ instruct vpopcount2I(vecD dst, vecD src) %{
%}
ins_pipe(pipe_class_default);
%}

// vector mask reductions

// Number of set lanes in a boolean vector mask held in a vecD operand.
// Selected for VectorMaskTrueCount when the input element type is T_BOOLEAN.
// "tmp" is a scratch vector register that receives the intermediate sum.
instruct vmask_truecount8B(iRegINoSp dst, vecD src, vecD tmp) %{
predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
match(Set dst (VectorMaskTrueCount src));
effect(TEMP tmp);
ins_cost(2 * INSN_COST);
format %{ "addv $tmp, $src\n\t"
"umov $dst, $tmp, B, 0\t# vector (8B)" %}
ins_encode %{
// Input "src" is a vector of boolean represented as bytes with
// 0x00/0x01 as element values.
//
// Because every true lane contributes exactly 1, adding up the byte
// lanes (addv, 8B arrangement) gives the true count. The sum is then
// copied from byte lane 0 of "tmp" into the general register "dst".
__ addv(as_FloatRegister($tmp$$reg), __ T8B, as_FloatRegister($src$$reg));
__ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ B, 0);
%}
ins_pipe(pipe_slow);
%}

// Number of set lanes in a boolean vector mask held in a vecX operand.
// Selected for VectorMaskTrueCount when the input element type is T_BOOLEAN.
// "tmp" is a scratch vector register that receives the intermediate sum.
instruct vmask_truecount16B(iRegINoSp dst, vecX src, vecX tmp) %{
predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
match(Set dst (VectorMaskTrueCount src));
effect(TEMP tmp);
ins_cost(2 * INSN_COST);
format %{ "addv $tmp, $src\n\t"
"umov $dst, $tmp, B, 0\t# vector (16B)" %}
ins_encode %{
// Input "src" is a vector of boolean represented as bytes with
// 0x00/0x01 as element values.
//
// Because every true lane contributes exactly 1, adding up the byte
// lanes (addv, 16B arrangement) gives the true count. The sum is then
// copied from byte lane 0 of "tmp" into the general register "dst".
__ addv(as_FloatRegister($tmp$$reg), __ T16B, as_FloatRegister($src$$reg));
__ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ B, 0);
%}
ins_pipe(pipe_slow);
%}

// Position of the first set lane of a boolean vector mask that has fewer
// than 8 lanes (see the length() < 8 term of the predicate).
// The flags register is declared killed because the encoding ends with a
// cmpw/cselw pair.
instruct vmask_firsttrue_LT8B(iRegINoSp dst, vecD src, rFlagsReg cr) %{
predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN &&
n->in(1)->bottom_type()->is_vect()->length() < 8);
match(Set dst (VectorMaskFirstTrue src));
effect(KILL cr);
ins_cost(7 * INSN_COST);
format %{ "vmask_firsttrue $dst, $src\t# vector (4I/4S/2I)" %}
ins_encode %{
// Returns the index of the first active lane of the
// vector mask, or VLENGTH if no lane is active.
//
// Input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values.
//
// Computed by reversing the bits and counting the leading
// zero bytes.
__ fmovd($dst$$Register, as_FloatRegister($src$$reg));
__ rbit($dst$$Register, $dst$$Register);
__ clz($dst$$Register, $dst$$Register);
// clz counts bits; dividing by 8 (lsrw by 3) converts to a byte position.
__ lsrw($dst$$Register, $dst$$Register, 3);
// The scan above covers all 8 bytes of the 64-bit value, so its result
// can exceed the real lane count (for example 8 when nothing is set).
// Clamp it to the vector length so "no lane active" reports VLENGTH.
__ movw(rscratch1, vector_length(this, $src));
__ cmpw($dst$$Register, rscratch1);
__ cselw($dst$$Register, rscratch1, $dst$$Register, Assembler::GE);
%}
ins_pipe(pipe_slow);
%}

// Position of the first set lane of a boolean vector mask with exactly
// 8 lanes (see the length() == 8 term of the predicate).
instruct vmask_firsttrue8B(iRegINoSp dst, vecD src) %{
predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN &&
n->in(1)->bottom_type()->is_vect()->length() == 8);
match(Set dst (VectorMaskFirstTrue src));
ins_cost(4 * INSN_COST);
format %{ "vmask_firsttrue $dst, $src\t# vector (8B)" %}
ins_encode %{
// Returns the index of the first active lane of the
// vector mask, or VLENGTH if no lane is active.
//
// Input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values.
//
// Computed by reversing the bits and counting the leading
// zero bytes.
//
// No clamping is required here: for an all-zero mask clz yields 64,
// and 64 / 8 = 8, which already equals VLENGTH.
__ fmovd($dst$$Register, as_FloatRegister($src$$reg));
__ rbit($dst$$Register, $dst$$Register);
__ clz($dst$$Register, $dst$$Register);
__ lsrw($dst$$Register, $dst$$Register, 3);
%}
ins_pipe(pipe_slow);
%}

// Position of the first set lane of a boolean vector mask held in a vecX
// operand. The two 64-bit halves are examined one at a time, low half first.
instruct vmask_firsttrue16B(iRegINoSp dst, vecX src) %{
predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
match(Set dst (VectorMaskFirstTrue src));
ins_cost(6 * INSN_COST);
format %{ "vmask_firsttrue $dst, $src\t# vector (16B)" %}
ins_encode %{
// Returns the index of the first active lane of the
// vector mask, or 16 (VLENGTH) if no lane is active.
//
// Input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values.

Label FIRST_TRUE_INDEX;

// Try to compute the result from lower 64 bits.
// rscratch1 holds the lane number of the first byte of the half that
// ends up being scanned: 0 for the low half.
__ fmovd($dst$$Register, as_FloatRegister($src$$reg));
__ movw(rscratch1, zr);
__ cbnz($dst$$Register, FIRST_TRUE_INDEX);

// Compute the result from the higher 64 bits.
// Reached only when the low half is all zero; the base lane becomes 8.
__ fmovhid($dst$$Register, as_FloatRegister($src$$reg));
__ movw(rscratch1, 8);

// Reverse the bits and count the leading zero bytes.
// Result = base lane + (clz / 8). If the high half is zero as well,
// clz yields 64 and the result is 8 + 8 = 16.
__ bind(FIRST_TRUE_INDEX);
__ rbit($dst$$Register, $dst$$Register);
__ clz($dst$$Register, $dst$$Register);
__ addw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
%}
ins_pipe(pipe_slow);
%}

// Position of the last set lane of a boolean vector mask held in a vecD
// operand.
instruct vmask_lasttrue8B(iRegINoSp dst, vecD src) %{
predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
match(Set dst (VectorMaskLastTrue src));
ins_cost(4 * INSN_COST);
format %{ "vmask_lasttrue $dst, $src\t# vector (8B)" %}
ins_encode %{
// Returns the index of the last active lane of the
// vector mask, or -1 if no lane is active.
//
// Input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values.
//
// Computed by counting the leading zero bytes and
// subtracting that count from 7 (VLENGTH - 1).
// For an all-zero mask clz yields 64, so the result is 7 - 8 = -1.
__ fmovd($dst$$Register, as_FloatRegister($src$$reg));
__ clz($dst$$Register, $dst$$Register);
__ movw(rscratch1, 7);
__ subw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
%}
ins_pipe(pipe_slow);
%}

// Position of the last set lane of a boolean vector mask held in a vecX
// operand. The two 64-bit halves are examined one at a time, high half first.
instruct vmask_lasttrue16B(iRegINoSp dst, vecX src) %{
predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
match(Set dst (VectorMaskLastTrue src));
ins_cost(5 * INSN_COST);
format %{ "vmask_lasttrue $dst, $src\t# vector (16B)" %}
ins_encode %{
// Returns the index of the last active lane of the
// vector mask, or -1 if no lane is active.
//
// Input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values.

Label LAST_TRUE_INDEX;

// Try to compute the result from higher 64 bits.
// rscratch1 holds the lane number of the topmost byte of the half that
// ends up being scanned: 15 for the high half.
__ fmovhid($dst$$Register, as_FloatRegister($src$$reg));
__ movw(rscratch1, 16 - 1);
__ cbnz($dst$$Register, LAST_TRUE_INDEX);

// Compute the result from the lower 64 bits.
// Reached only when the high half is all zero; the top lane becomes 7.
__ fmovd($dst$$Register, as_FloatRegister($src$$reg));
__ movw(rscratch1, 8 - 1);

// Count the leading zero bytes and subtract that count from the top
// lane number of the scanned half (15 or 7). If the low half is zero
// as well, clz yields 64 and the result is 7 - 8 = -1.
__ bind(LAST_TRUE_INDEX);
__ clz($dst$$Register, $dst$$Register);
__ subw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
%}
ins_pipe(pipe_slow);
%}
148 changes: 148 additions & 0 deletions src/hotspot/cpu/aarch64/aarch64_neon_ad.m4
Expand Up @@ -2243,3 +2243,151 @@ instruct vpopcount$1$2`'(vec$5 dst, vec$5 src) %{
dnl $1 $2 $3 $4 $5
VPOPCOUNT(4, I, 16, 8, X)
VPOPCOUNT(2, I, 8, 4, D)
dnl
dnl VMASK_TRUECOUNT($1, $2 )
dnl VMASK_TRUECOUNT(suffix, reg)
dnl
dnl Emits one rule named vmask_truecount<suffix> that matches VectorMaskTrueCount.
dnl   suffix - appended to the rule name, used as the addv arrangement (T<suffix>)
dnl            and printed in the trailing comment of the format string.
dnl   reg    - vector operand class used for both src and tmp.
define(`VMASK_TRUECOUNT', `
instruct vmask_truecount$1(iRegINoSp dst, $2 src, $2 tmp) %{
predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
match(Set dst (VectorMaskTrueCount src));
effect(TEMP tmp);
ins_cost(2 * INSN_COST);
format %{ "addv $tmp, $src\n\t"
"umov $dst, $tmp, B, 0\t# vector ($1)" %}
ins_encode %{
// Input "src" is a vector of boolean represented as bytes with
// 0x00/0x01 as element values.
__ addv(as_FloatRegister($tmp$$reg), __ T$1, as_FloatRegister($src$$reg));
__ umov($dst$$Register, as_FloatRegister($tmp$$reg), __ B, 0);
%}
ins_pipe(pipe_slow);
%}')dnl
dnl
dnl
dnl ARGLIST(suffix)
dnl Operand list for the vmask_firsttrue rules produced by VMASK_FIRSTTRUE_D.
dnl The _LT8B suffix gets an extra rFlagsReg cr operand; every other suffix
dnl gets only dst and src.
dnl ARGLIST appears inside the quoted body of VMASK_FIRSTTRUE_D, so it is
dnl expanded when that macro is invoked and must still be defined then.
define(`ARGLIST',
`ifelse($1, `_LT8B', `iRegINoSp dst, vecD src, rFlagsReg cr', `iRegINoSp dst, vecD src')')
dnl
dnl VMASK_FIRSTTRUE_D($1, $2, $3, $4 )
dnl VMASK_FIRSTTRUE_D(suffix, cond, cost, size)
dnl
dnl Emits one rule named vmask_firsttrue<suffix> that matches VectorMaskFirstTrue
dnl on a vecD source.
dnl   suffix - appended to the rule name. The value _LT8B additionally selects
dnl            the rFlagsReg cr operand, the KILL cr effect and the trailing
dnl            movw/cmpw/cselw sequence that clamps the result to the vector length.
dnl   cond   - comparison operator placed between the vector length and 8 in
dnl            the predicate.
dnl   cost   - multiplier applied to INSN_COST in ins_cost.
dnl   size   - text printed in the trailing comment of the format string.
define(`VMASK_FIRSTTRUE_D', `
instruct vmask_firsttrue$1(ARGLIST($1)) %{
predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN &&
n->in(1)->bottom_type()->is_vect()->length() $2 8);
match(Set dst (VectorMaskFirstTrue src));dnl
ifelse($1, `_LT8B', `
effect(KILL cr);')
ins_cost($3 * INSN_COST);
format %{ "vmask_firsttrue $dst, $src\t# vector ($4)" %}
ins_encode %{
// Returns the index of the first active lane of the
// vector mask, or VLENGTH if no lane is active.
//
// Input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values.
//
// Computed by reversing the bits and counting the leading
// zero bytes.
__ fmovd($dst$$Register, as_FloatRegister($src$$reg));
__ rbit($dst$$Register, $dst$$Register);
__ clz($dst$$Register, $dst$$Register);
__ lsrw($dst$$Register, $dst$$Register, 3);dnl
ifelse(`$1', `_LT8B', `
__ movw(rscratch1, vector_length(this, $src));
__ cmpw($dst$$Register, rscratch1);
__ cselw($dst$$Register, rscratch1, $dst$$Register, Assembler::GE);')
%}
ins_pipe(pipe_slow);
%}')dnl
dnl
dnl
dnl ARGLIST is referenced inside the quoted body of VMASK_FIRSTTRUE_D and is
dnl therefore expanded only when that macro is invoked below. It has to stay
dnl defined until after those invocations, and the name passed to undefine
dnl has to be quoted so that the macro is not expanded before being removed.
// vector mask reductions
VMASK_TRUECOUNT(8B, vecD)
VMASK_TRUECOUNT(16B, vecX)
VMASK_FIRSTTRUE_D(_LT8B, <, 7, 4I/4S/2I)
VMASK_FIRSTTRUE_D(8B, ==, 4, 8B)
undefine(`ARGLIST')dnl

// Position of the first set lane of a boolean vector mask held in a vecX
// operand. The two 64-bit halves are examined one at a time, low half first.
instruct vmask_firsttrue16B(iRegINoSp dst, vecX src) %{
predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
match(Set dst (VectorMaskFirstTrue src));
ins_cost(6 * INSN_COST);
format %{ "vmask_firsttrue $dst, $src\t# vector (16B)" %}
ins_encode %{
// Returns the index of the first active lane of the
// vector mask, or 16 (VLENGTH) if no lane is active.
//
// Input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values.

Label FIRST_TRUE_INDEX;

// Try to compute the result from lower 64 bits.
// rscratch1 holds the lane number of the first byte of the half that
// ends up being scanned: 0 for the low half.
__ fmovd($dst$$Register, as_FloatRegister($src$$reg));
__ movw(rscratch1, zr);
__ cbnz($dst$$Register, FIRST_TRUE_INDEX);

// Compute the result from the higher 64 bits.
// Reached only when the low half is all zero; the base lane becomes 8.
__ fmovhid($dst$$Register, as_FloatRegister($src$$reg));
__ movw(rscratch1, 8);

// Reverse the bits and count the leading zero bytes.
// Result = base lane + (clz / 8). If the high half is zero as well,
// clz yields 64 and the result is 8 + 8 = 16.
__ bind(FIRST_TRUE_INDEX);
__ rbit($dst$$Register, $dst$$Register);
__ clz($dst$$Register, $dst$$Register);
__ addw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
%}
ins_pipe(pipe_slow);
%}

// Position of the last set lane of a boolean vector mask held in a vecD
// operand.
instruct vmask_lasttrue8B(iRegINoSp dst, vecD src) %{
predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
match(Set dst (VectorMaskLastTrue src));
ins_cost(4 * INSN_COST);
format %{ "vmask_lasttrue $dst, $src\t# vector (8B)" %}
ins_encode %{
// Returns the index of the last active lane of the
// vector mask, or -1 if no lane is active.
//
// Input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values.
//
// Computed by counting the leading zero bytes and
// subtracting that count from 7 (VLENGTH - 1).
// For an all-zero mask clz yields 64, so the result is 7 - 8 = -1.
__ fmovd($dst$$Register, as_FloatRegister($src$$reg));
__ clz($dst$$Register, $dst$$Register);
__ movw(rscratch1, 7);
__ subw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
%}
ins_pipe(pipe_slow);
%}

// Position of the last set lane of a boolean vector mask held in a vecX
// operand. The two 64-bit halves are examined one at a time, high half first.
instruct vmask_lasttrue16B(iRegINoSp dst, vecX src) %{
predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_BOOLEAN);
match(Set dst (VectorMaskLastTrue src));
ins_cost(5 * INSN_COST);
format %{ "vmask_lasttrue $dst, $src\t# vector (16B)" %}
ins_encode %{
// Returns the index of the last active lane of the
// vector mask, or -1 if no lane is active.
//
// Input "src" is a vector of boolean represented as
// bytes with 0x00/0x01 as element values.

Label LAST_TRUE_INDEX;

// Try to compute the result from higher 64 bits.
// rscratch1 holds the lane number of the topmost byte of the half that
// ends up being scanned: 15 for the high half.
__ fmovhid($dst$$Register, as_FloatRegister($src$$reg));
__ movw(rscratch1, 16 - 1);
__ cbnz($dst$$Register, LAST_TRUE_INDEX);

// Compute the result from the lower 64 bits.
// Reached only when the high half is all zero; the top lane becomes 7.
__ fmovd($dst$$Register, as_FloatRegister($src$$reg));
__ movw(rscratch1, 8 - 1);

// Count the leading zero bytes and subtract that count from the top
// lane number of the scanned half (15 or 7). If the low half is zero
// as well, clz yields 64 and the result is 7 - 8 = -1.
__ bind(LAST_TRUE_INDEX);
__ clz($dst$$Register, $dst$$Register);
__ subw($dst$$Register, rscratch1, $dst$$Register, Assembler::LSR, 3);
%}
ins_pipe(pipe_slow);
%}
15 changes: 3 additions & 12 deletions src/hotspot/cpu/aarch64/aarch64_sve.ad
Expand Up @@ -87,18 +87,6 @@ source_hpp %{
%}

source %{
// Element type of the vector value described by the bottom type of node "n".
static inline BasicType vector_element_basic_type(const MachNode* n) {
  return n->bottom_type()->is_vect()->element_basic_type();
}

// Element type of the vector value feeding operand "opnd" of node "use".
// The operand is mapped to an input edge of "use" through operand_index(),
// and the bottom type of that input node is queried.
static inline BasicType vector_element_basic_type(const MachNode* use, const MachOper* opnd) {
  Node* input = use->in(use->operand_index(opnd));
  return input->bottom_type()->is_vect()->element_basic_type();
}

static Assembler::SIMD_RegVariant elemBytes_to_regVariant(int esize) {
switch(esize) {
case 1:
Expand Down Expand Up @@ -203,6 +191,9 @@ source %{
case Op_VectorReinterpret:
case Op_VectorStoreMask:
case Op_VectorTest:
case Op_VectorMaskTrueCount:
case Op_VectorMaskLastTrue:
case Op_VectorMaskFirstTrue:
return false;
default:
return true;
Expand Down

1 comment on commit ea77ef8

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.