Skip to content

Commit

Permalink
8285013: AArch64: [vectorapi] Backend support of ExpandV for SVE2
Browse files Browse the repository at this point in the history
Reviewed-by: njian
  • Loading branch information
e1iu committed Apr 28, 2022
1 parent 40ac814 commit 35d92ab
Show file tree
Hide file tree
Showing 6 changed files with 198 additions and 107 deletions.
1 change: 1 addition & 0 deletions src/hotspot/cpu/aarch64/aarch64.ad
Expand Up @@ -2484,6 +2484,7 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
case Op_StoreVectorScatter:
case Op_CompressV:
case Op_CompressM:
case Op_ExpandV:
return false;
default:
break;
Expand Down
60 changes: 48 additions & 12 deletions src/hotspot/cpu/aarch64/aarch64_sve.ad
Expand Up @@ -149,6 +149,8 @@ source %{
case Op_LoadVector:
case Op_StoreVector:
return Matcher::vector_size_supported(bt, vlen);
case Op_ExpandV:
if (UseSVE < 2 || is_subword_type(bt)) return false;
default:
break;
}
Expand Down Expand Up @@ -5529,17 +5531,17 @@ instruct vloadmask_extend(pRegGov dst, vReg src, vReg tmp, rFlagsReg cr) %{

// ---------------------------- Compress/Expand Operations ---------------------------

instruct mcompress(pReg dst, pReg mask, rFlagsReg cr) %{
instruct mcompress(pReg dst, pReg pg, rFlagsReg cr) %{
predicate(UseSVE > 0);
match(Set dst (CompressM mask));
match(Set dst (CompressM pg));
effect(KILL cr);
ins_cost(2 * SVE_COST);
format %{ "sve_cntp rscratch1, $mask\n\t"
format %{ "sve_cntp rscratch1, $pg\n\t"
"sve_whilelo $dst, zr, rscratch1\t# mask compress (B/H/S/D)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_cntp(rscratch1, size, ptrue, as_PRegister($mask$$reg));
__ sve_cntp(rscratch1, size, ptrue, as_PRegister($pg$$reg));
__ sve_whilelo(as_PRegister($dst$$reg), size, zr, rscratch1);
%}
ins_pipe(pipe_slow);
Expand All @@ -5562,35 +5564,69 @@ instruct vcompress(vReg dst, vReg src, pRegGov pg) %{
ins_pipe(pipe_slow);
%}

instruct vcompressB(vReg dst, vReg src, pReg mask, vReg vtmp1, vReg vtmp2, vReg vtmp3, vReg vtmp4,
instruct vcompressB(vReg dst, vReg src, pReg pg, vReg vtmp1, vReg vtmp2, vReg vtmp3, vReg vtmp4,
pReg ptmp, pRegGov pgtmp) %{
predicate(UseSVE > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
effect(TEMP_DEF dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP ptmp, TEMP pgtmp);
match(Set dst (CompressV src mask));
match(Set dst (CompressV src pg));
ins_cost(13 * SVE_COST);
format %{ "sve_compact $dst, $src, $mask\t# vector compress (B)" %}
format %{ "sve_compact $dst, $src, $pg\t# vector compress (B)" %}
ins_encode %{
__ sve_compress_byte(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), as_PRegister($mask$$reg),
__ sve_compress_byte(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), as_PRegister($pg$$reg),
as_FloatRegister($vtmp1$$reg),as_FloatRegister($vtmp2$$reg),
as_FloatRegister($vtmp3$$reg),as_FloatRegister($vtmp4$$reg),
as_PRegister($ptmp$$reg), as_PRegister($pgtmp$$reg));
%}
ins_pipe(pipe_slow);
%}

instruct vcompressS(vReg dst, vReg src, pReg mask, vReg vtmp1, vReg vtmp2, pRegGov pgtmp) %{
instruct vcompressS(vReg dst, vReg src, pReg pg, vReg vtmp1, vReg vtmp2, pRegGov pgtmp) %{
predicate(UseSVE > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
effect(TEMP_DEF dst, TEMP vtmp1, TEMP vtmp2, TEMP pgtmp);
match(Set dst (CompressV src mask));
match(Set dst (CompressV src pg));
ins_cost(38 * SVE_COST);
format %{ "sve_compact $dst, $src, $mask\t# vector compress (H)" %}
format %{ "sve_compact $dst, $src, $pg\t# vector compress (H)" %}
ins_encode %{
__ sve_compress_short(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), as_PRegister($mask$$reg),
__ sve_compress_short(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), as_PRegister($pg$$reg),
as_FloatRegister($vtmp1$$reg),as_FloatRegister($vtmp2$$reg), as_PRegister($pgtmp$$reg));
%}
ins_pipe(pipe_slow);
%}

// Vector expand: distribute the lowest-numbered elements of src into the
// active lanes of dst in ascending lane order; inactive lanes are zeroed
// (see the worked example below). No predicate is needed here: unsupported
// configurations (UseSVE < 2 or subword element types) are rejected in
// Matcher::match_rule_supported_vector — TODO confirm against that matcher.
instruct vexpand(vReg dst, vReg src, pRegGov pg) %{
match(Set dst (ExpandV src pg));
effect(TEMP_DEF dst);
ins_cost(4 * SVE_COST);
format %{ "sve_dup $dst, S/D, 0\n\t"
"sve_histcnt $dst, S/D, $pg, $dst, $dst\n\t"
"sve_sub $dst, S/D, 1\n\t"
"sve_tbl $dst, S/D, $src, $dst\t# vector expand (S/D)" %}
ins_encode %{
// Example input: src = 1 2 3 4 5 6 7 8
// pg = 1 0 0 1 1 0 1 1
// Expected result: dst = 4 0 0 5 6 0 7 8

// The basic idea is to use TBL which can shuffle the elements in the given
// vector flexibly. HISTCNT + SUB is used to generate the second source input
// for TBL whose value is used to select the indexed element from src vector.

BasicType bt = Matcher::vector_element_basic_type(this);
// HISTCNT is SVE2-only and this sequence handles S/D elements only.
assert(UseSVE == 2 && !is_subword_type(bt), "unsupported");
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
// dst = 0 0 0 0 0 0 0 0
__ sve_dup(as_FloatRegister($dst$$reg), size, 0);
// With dst as both sources (all zeros), each active lane of HISTCNT
// receives a running count of matching (zero) elements; inactive lanes
// are zeroed by the governing predicate.
// dst = 5 0 0 4 3 0 2 1
__ sve_histcnt(as_FloatRegister($dst$$reg), size, as_PRegister($pg$$reg),
as_FloatRegister($dst$$reg), as_FloatRegister($dst$$reg));
// Subtract 1 to turn the counts into zero-based TBL indices; the inactive
// (zero) lanes underflow to -1, which is an out-of-range TBL index.
// dst = 4 -1 -1 3 2 -1 1 0
__ sve_sub(as_FloatRegister($dst$$reg), size, 1);
// TBL selects src elements by index and writes zero for out-of-range
// indices, so the -1 lanes become 0 in the result.
// dst = 4 0 0 5 6 0 7 8
__ sve_tbl(as_FloatRegister($dst$$reg), size, as_FloatRegister($src$$reg),
as_FloatRegister($dst$$reg));
%}
ins_pipe(pipe_slow);
%}

instruct vmask_gen(pRegGov pg, iRegL len, rFlagsReg cr) %{
predicate(UseSVE > 0);
match(Set pg (VectorMaskGen len));
Expand Down
60 changes: 48 additions & 12 deletions src/hotspot/cpu/aarch64/aarch64_sve_ad.m4
Expand Up @@ -144,6 +144,8 @@ source %{
case Op_LoadVector:
case Op_StoreVector:
return Matcher::vector_size_supported(bt, vlen);
case Op_ExpandV:
if (UseSVE < 2 || is_subword_type(bt)) return false;
default:
break;
}
Expand Down Expand Up @@ -3068,17 +3070,17 @@ instruct vloadmask_extend(pRegGov dst, vReg src, vReg tmp, rFlagsReg cr) %{

// ---------------------------- Compress/Expand Operations ---------------------------

instruct mcompress(pReg dst, pReg mask, rFlagsReg cr) %{
instruct mcompress(pReg dst, pReg pg, rFlagsReg cr) %{
predicate(UseSVE > 0);
match(Set dst (CompressM mask));
match(Set dst (CompressM pg));
effect(KILL cr);
ins_cost(2 * SVE_COST);
format %{ "sve_cntp rscratch1, $mask\n\t"
format %{ "sve_cntp rscratch1, $pg\n\t"
"sve_whilelo $dst, zr, rscratch1\t# mask compress (B/H/S/D)" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
__ sve_cntp(rscratch1, size, ptrue, as_PRegister($mask$$reg));
__ sve_cntp(rscratch1, size, ptrue, as_PRegister($pg$$reg));
__ sve_whilelo(as_PRegister($dst$$reg), size, zr, rscratch1);
%}
ins_pipe(pipe_slow);
Expand All @@ -3101,35 +3103,69 @@ instruct vcompress(vReg dst, vReg src, pRegGov pg) %{
ins_pipe(pipe_slow);
%}

instruct vcompressB(vReg dst, vReg src, pReg mask, vReg vtmp1, vReg vtmp2, vReg vtmp3, vReg vtmp4,
instruct vcompressB(vReg dst, vReg src, pReg pg, vReg vtmp1, vReg vtmp2, vReg vtmp3, vReg vtmp4,
pReg ptmp, pRegGov pgtmp) %{
predicate(UseSVE > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_BYTE);
effect(TEMP_DEF dst, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP ptmp, TEMP pgtmp);
match(Set dst (CompressV src mask));
match(Set dst (CompressV src pg));
ins_cost(13 * SVE_COST);
format %{ "sve_compact $dst, $src, $mask\t# vector compress (B)" %}
format %{ "sve_compact $dst, $src, $pg\t# vector compress (B)" %}
ins_encode %{
__ sve_compress_byte(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), as_PRegister($mask$$reg),
__ sve_compress_byte(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), as_PRegister($pg$$reg),
as_FloatRegister($vtmp1$$reg),as_FloatRegister($vtmp2$$reg),
as_FloatRegister($vtmp3$$reg),as_FloatRegister($vtmp4$$reg),
as_PRegister($ptmp$$reg), as_PRegister($pgtmp$$reg));
%}
ins_pipe(pipe_slow);
%}

instruct vcompressS(vReg dst, vReg src, pReg mask, vReg vtmp1, vReg vtmp2, pRegGov pgtmp) %{
instruct vcompressS(vReg dst, vReg src, pReg pg, vReg vtmp1, vReg vtmp2, pRegGov pgtmp) %{
predicate(UseSVE > 0 && n->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
effect(TEMP_DEF dst, TEMP vtmp1, TEMP vtmp2, TEMP pgtmp);
match(Set dst (CompressV src mask));
match(Set dst (CompressV src pg));
ins_cost(38 * SVE_COST);
format %{ "sve_compact $dst, $src, $mask\t# vector compress (H)" %}
format %{ "sve_compact $dst, $src, $pg\t# vector compress (H)" %}
ins_encode %{
__ sve_compress_short(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), as_PRegister($mask$$reg),
__ sve_compress_short(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg), as_PRegister($pg$$reg),
as_FloatRegister($vtmp1$$reg),as_FloatRegister($vtmp2$$reg), as_PRegister($pgtmp$$reg));
%}
ins_pipe(pipe_slow);
%}

// Vector expand: distribute the lowest-numbered elements of src into the
// active lanes of dst in ascending lane order; inactive lanes are zeroed
// (see the worked example below). No predicate is needed here: unsupported
// configurations (UseSVE < 2 or subword element types) are rejected in
// Matcher::match_rule_supported_vector — TODO confirm against that matcher.
instruct vexpand(vReg dst, vReg src, pRegGov pg) %{
match(Set dst (ExpandV src pg));
effect(TEMP_DEF dst);
ins_cost(4 * SVE_COST);
format %{ "sve_dup $dst, S/D, 0\n\t"
"sve_histcnt $dst, S/D, $pg, $dst, $dst\n\t"
"sve_sub $dst, S/D, 1\n\t"
"sve_tbl $dst, S/D, $src, $dst\t# vector expand (S/D)" %}
ins_encode %{
// Example input: src = 1 2 3 4 5 6 7 8
// pg = 1 0 0 1 1 0 1 1
// Expected result: dst = 4 0 0 5 6 0 7 8

// The basic idea is to use TBL which can shuffle the elements in the given
// vector flexibly. HISTCNT + SUB is used to generate the second source input
// for TBL whose value is used to select the indexed element from src vector.

BasicType bt = Matcher::vector_element_basic_type(this);
// HISTCNT is SVE2-only and this sequence handles S/D elements only.
assert(UseSVE == 2 && !is_subword_type(bt), "unsupported");
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
// dst = 0 0 0 0 0 0 0 0
__ sve_dup(as_FloatRegister($dst$$reg), size, 0);
// With dst as both sources (all zeros), each active lane of HISTCNT
// receives a running count of matching (zero) elements; inactive lanes
// are zeroed by the governing predicate.
// dst = 5 0 0 4 3 0 2 1
__ sve_histcnt(as_FloatRegister($dst$$reg), size, as_PRegister($pg$$reg),
as_FloatRegister($dst$$reg), as_FloatRegister($dst$$reg));
// Subtract 1 to turn the counts into zero-based TBL indices; the inactive
// (zero) lanes underflow to -1, which is an out-of-range TBL index.
// dst = 4 -1 -1 3 2 -1 1 0
__ sve_sub(as_FloatRegister($dst$$reg), size, 1);
// TBL selects src elements by index and writes zero for out-of-range
// indices, so the -1 lanes become 0 in the result.
// dst = 4 0 0 5 6 0 7 8
__ sve_tbl(as_FloatRegister($dst$$reg), size, as_FloatRegister($src$$reg),
as_FloatRegister($dst$$reg));
%}
ins_pipe(pipe_slow);
%}

instruct vmask_gen(pRegGov pg, iRegL len, rFlagsReg cr) %{
predicate(UseSVE > 0);
match(Set pg (VectorMaskGen len));
Expand Down
9 changes: 9 additions & 0 deletions src/hotspot/cpu/aarch64/assembler_aarch64.hpp
Expand Up @@ -3783,6 +3783,15 @@ void sve_cmp(Condition cond, PRegister Pd, SIMD_RegVariant T,
pgrf(Pg, 10), rf(Zn, 5), rf(Zd, 0);
}

// SVE2 Count matching elements in vector
// HISTCNT Zd.T, Pg/z, Zn.T, Zm.T
// Pg is a zeroing governing predicate (rendered "Pg/z" in assembly — see the
// asmtest expectations for this instruction); only S and D element sizes are
// valid, enforced by the assert below.
void sve_histcnt(FloatRegister Zd, SIMD_RegVariant T, PRegister Pg,
FloatRegister Zn, FloatRegister Zm) {
starti;
assert(T == S || T == D, "invalid size");
// Encoding: 01000101 | size(23:22) | 1(21) | Zm(20:16)
//           | 110(15:13) | Pg(12:10) | Zn(9:5) | Zd(4:0)
f(0b01000101, 31, 24), f(T, 23, 22), f(0b1, 21), rf(Zm, 16);
f(0b110, 15, 13), pgrf(Pg, 10), rf(Zn, 5), rf(Zd, 0);
}

Assembler(CodeBuffer* code) : AbstractAssembler(code) {
}

Expand Down
10 changes: 8 additions & 2 deletions test/hotspot/gtest/aarch64/aarch64-asmtest.py
Expand Up @@ -1786,6 +1786,9 @@ def generate(kind, names):
["punpkhi", "__ sve_punpkhi(p1, p0);", "punpkhi\tp1.h, p0.b"],
["compact", "__ sve_compact(z16, __ S, z16, p1);", "compact\tz16.s, p1, z16.s"],
["compact", "__ sve_compact(z16, __ D, z16, p1);", "compact\tz16.d, p1, z16.d"],
# SVE2 instructions
["histcnt", "__ sve_histcnt(z16, __ S, p0, z16, z16);", "histcnt\tz16.s, p0/z, z16.s, z16.s"],
["histcnt", "__ sve_histcnt(z17, __ D, p0, z17, z17);", "histcnt\tz17.d, p0/z, z17.d, z17.d"],
])

print "\n// FloatImmediateOp"
Expand Down Expand Up @@ -1871,6 +1874,8 @@ def generate(kind, names):
["bic", "ZZZ"],
["uzp1", "ZZZ"],
["uzp2", "ZZZ"],
# SVE2 instructions
["histcnt", "ZPZZ", "z"],
])

generate(SVEReductionOp, [["andv", 0], ["orv", 0], ["eorv", 0], ["smaxv", 0], ["sminv", 0],
Expand All @@ -1881,8 +1886,9 @@ def generate(kind, names):

outfile.close()

# compile for sve with 8.3 and sha3 because of SHA3 crypto extension.
subprocess.check_call([AARCH64_AS, "-march=armv8.3-a+sha3+sve", "aarch64ops.s", "-o", "aarch64ops.o"])
# compile for sve with armv9-a+sha3 because of SHA3 crypto extension and SVE2 instructions.
# armv9-a enables sve and sve2 by default.
subprocess.check_call([AARCH64_AS, "-march=armv9-a+sha3", "aarch64ops.s", "-o", "aarch64ops.o"])

print
print "/*"
Expand Down

0 comments on commit 35d92ab

Please sign in to comment.