Skip to content

Commit f71f9dc

Browse files
Dong Bo authored and RealFYang committed
8255949: AArch64: Add support for vectorized shift right and accumulate
Reviewed-by: aph
1 parent 1332ba3 commit f71f9dc

File tree

3 files changed

+349
-0
lines changed

3 files changed

+349
-0
lines changed

src/hotspot/cpu/aarch64/aarch64.ad

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18922,6 +18922,216 @@ instruct vsrl2L_imm(vecX dst, vecX src, immI shift) %{
1892218922
ins_pipe(vshift128_imm);
1892318923
%}
1892418924

18925+
// Signed shift right by immediate and accumulate, 8 byte lanes in a 64-bit
// vector: dst = dst + (src >> shift).
instruct vsraa8B_imm(vecD dst, vecD src, immI shift) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (AddVB dst (RShiftVB src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "ssra $dst, $src, $shift\t# vector (8B)" %}
  ins_encode %{
    // An arithmetic shift of an 8-bit lane by 8 or more leaves only copies
    // of the sign bit, which is what a shift by 7 produces, so clamp to 7.
    const int count = (int)$shift$$constant;
    const int imm = (count < 8) ? count : 7;
    __ ssra(as_FloatRegister($dst$$reg), __ T8B,
            as_FloatRegister($src$$reg), imm);
  %}
  ins_pipe(vshift64_imm);
%}
18938+
18939+
// Signed shift right by immediate and accumulate, 16 byte lanes in a 128-bit
// vector: dst = dst + (src >> shift).
instruct vsraa16B_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 16);
  match(Set dst (AddVB dst (RShiftVB src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "ssra $dst, $src, $shift\t# vector (16B)" %}
  ins_encode %{
    // Shifting an 8-bit lane arithmetically by 8 or more gives the same
    // result as shifting by 7 (only sign bits remain), so clamp to 7.
    const int count = (int)$shift$$constant;
    const int imm = (count < 8) ? count : 7;
    __ ssra(as_FloatRegister($dst$$reg), __ T16B,
            as_FloatRegister($src$$reg), imm);
  %}
  ins_pipe(vshift128_imm);
%}
18952+
18953+
// Signed shift right by immediate and accumulate, 4 short lanes in a 64-bit
// vector: dst = dst + (src >> shift).
instruct vsraa4S_imm(vecD dst, vecD src, immI shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (AddVS dst (RShiftVS src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "ssra $dst, $src, $shift\t# vector (4H)" %}
  ins_encode %{
    // An arithmetic shift of a 16-bit lane by 16 or more leaves only copies
    // of the sign bit, which is what a shift by 15 produces, so clamp to 15.
    const int count = (int)$shift$$constant;
    const int imm = (count < 16) ? count : 15;
    __ ssra(as_FloatRegister($dst$$reg), __ T4H,
            as_FloatRegister($src$$reg), imm);
  %}
  ins_pipe(vshift64_imm);
%}
18966+
18967+
// Signed shift right by immediate and accumulate, 8 short lanes in a 128-bit
// vector: dst = dst + (src >> shift).
instruct vsraa8S_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (AddVS dst (RShiftVS src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "ssra $dst, $src, $shift\t# vector (8H)" %}
  ins_encode %{
    // Shifting a 16-bit lane arithmetically by 16 or more gives the same
    // result as shifting by 15 (only sign bits remain), so clamp to 15.
    const int count = (int)$shift$$constant;
    const int imm = (count < 16) ? count : 15;
    __ ssra(as_FloatRegister($dst$$reg), __ T8H,
            as_FloatRegister($src$$reg), imm);
  %}
  ins_pipe(vshift128_imm);
%}
18980+
18981+
// Signed shift right by immediate and accumulate, 2 int lanes in a 64-bit
// vector: dst = dst + (src >> shift).
instruct vsraa2I_imm(vecD dst, vecD src, immI shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (AddVI dst (RShiftVI src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "ssra $dst, $src, $shift\t# vector (2S)" %}
  ins_encode %{
    // The shift constant is passed through unmodified (no clamping here).
    const int imm = (int)$shift$$constant;
    __ ssra(as_FloatRegister($dst$$reg), __ T2S,
            as_FloatRegister($src$$reg), imm);
  %}
  ins_pipe(vshift64_imm);
%}
18993+
18994+
// Signed shift right by immediate and accumulate, 4 int lanes in a 128-bit
// vector: dst = dst + (src >> shift).
instruct vsraa4I_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (AddVI dst (RShiftVI src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "ssra $dst, $src, $shift\t# vector (4S)" %}
  ins_encode %{
    // The shift constant is passed through unmodified (no clamping here).
    const int imm = (int)$shift$$constant;
    __ ssra(as_FloatRegister($dst$$reg), __ T4S,
            as_FloatRegister($src$$reg), imm);
  %}
  ins_pipe(vshift128_imm);
%}
19006+
19007+
// Signed shift right by immediate and accumulate, 2 long lanes in a 128-bit
// vector: dst = dst + (src >> shift).
instruct vsraa2L_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (AddVL dst (RShiftVL src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "ssra $dst, $src, $shift\t# vector (2D)" %}
  ins_encode %{
    // The shift constant is passed through unmodified (no clamping here).
    const int imm = (int)$shift$$constant;
    __ ssra(as_FloatRegister($dst$$reg), __ T2D,
            as_FloatRegister($src$$reg), imm);
  %}
  ins_pipe(vshift128_imm);
%}
19019+
19020+
// Unsigned shift right by immediate and accumulate, 8 byte lanes in a 64-bit
// vector: dst = dst + (src >>> shift).
instruct vsrla8B_imm(vecD dst, vecD src, immI shift) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (AddVB dst (URShiftVB src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "usra $dst, $src, $shift\t# vector (8B)" %}
  ins_encode %{
    int sh = (int)$shift$$constant;
    // A logical shift of an 8-bit lane by 8 or more yields zero, so the
    // accumulation leaves dst unchanged and nothing needs to be emitted.
    // src is an input-only operand of this rule and must not be written
    // (the previous code zeroed it with eor in this case).
    if (sh < 8) {
      __ usra(as_FloatRegister($dst$$reg), __ T8B,
              as_FloatRegister($src$$reg), sh);
    }
  %}
  ins_pipe(vshift64_imm);
%}
19038+
19039+
// Unsigned shift right by immediate and accumulate, 16 byte lanes in a
// 128-bit vector: dst = dst + (src >>> shift).
instruct vsrla16B_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 16);
  match(Set dst (AddVB dst (URShiftVB src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "usra $dst, $src, $shift\t# vector (16B)" %}
  ins_encode %{
    int sh = (int)$shift$$constant;
    // A logical shift of an 8-bit lane by 8 or more yields zero, so the
    // accumulation leaves dst unchanged and nothing needs to be emitted.
    // src is an input-only operand of this rule and must not be written
    // (the previous code zeroed it with eor in this case).
    if (sh < 8) {
      __ usra(as_FloatRegister($dst$$reg), __ T16B,
              as_FloatRegister($src$$reg), sh);
    }
  %}
  ins_pipe(vshift128_imm);
%}
19057+
19058+
// Unsigned shift right by immediate and accumulate, 4 short lanes in a
// 64-bit vector: dst = dst + (src >>> shift).
instruct vsrla4S_imm(vecD dst, vecD src, immI shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (AddVS dst (URShiftVS src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "usra $dst, $src, $shift\t# vector (4H)" %}
  ins_encode %{
    int sh = (int)$shift$$constant;
    // A logical shift of a 16-bit lane by 16 or more yields zero, so the
    // accumulation leaves dst unchanged and nothing needs to be emitted.
    // src is an input-only operand of this rule and must not be written
    // (the previous code zeroed it with eor in this case).
    if (sh < 16) {
      // Must be usra (shift and accumulate into dst), matching the AddVS in
      // the match rule; a plain ushr would overwrite dst and drop the addend.
      __ usra(as_FloatRegister($dst$$reg), __ T4H,
              as_FloatRegister($src$$reg), sh);
    }
  %}
  ins_pipe(vshift64_imm);
%}
19076+
19077+
// Unsigned shift right by immediate and accumulate, 8 short lanes in a
// 128-bit vector: dst = dst + (src >>> shift).
instruct vsrla8S_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 8);
  match(Set dst (AddVS dst (URShiftVS src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "usra $dst, $src, $shift\t# vector (8H)" %}
  ins_encode %{
    int sh = (int)$shift$$constant;
    // A logical shift of a 16-bit lane by 16 or more yields zero, so the
    // accumulation leaves dst unchanged and nothing needs to be emitted.
    // src is an input-only operand of this rule and must not be written
    // (the previous code zeroed it with eor in this case).
    if (sh < 16) {
      __ usra(as_FloatRegister($dst$$reg), __ T8H,
              as_FloatRegister($src$$reg), sh);
    }
  %}
  ins_pipe(vshift128_imm);
%}
19095+
19096+
// Unsigned shift right by immediate and accumulate, 2 int lanes in a 64-bit
// vector: dst = dst + (src >>> shift).
instruct vsrla2I_imm(vecD dst, vecD src, immI shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (AddVI dst (URShiftVI src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "usra $dst, $src, $shift\t# vector (2S)" %}
  ins_encode %{
    // The shift constant is passed through unmodified (no range handling here).
    const int imm = (int)$shift$$constant;
    __ usra(as_FloatRegister($dst$$reg), __ T2S,
            as_FloatRegister($src$$reg), imm);
  %}
  ins_pipe(vshift64_imm);
%}
19108+
19109+
// Unsigned shift right by immediate and accumulate, 4 int lanes in a 128-bit
// vector: dst = dst + (src >>> shift).
instruct vsrla4I_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 4);
  match(Set dst (AddVI dst (URShiftVI src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "usra $dst, $src, $shift\t# vector (4S)" %}
  ins_encode %{
    // The shift constant is passed through unmodified (no range handling here).
    const int imm = (int)$shift$$constant;
    __ usra(as_FloatRegister($dst$$reg), __ T4S,
            as_FloatRegister($src$$reg), imm);
  %}
  ins_pipe(vshift128_imm);
%}
19121+
19122+
// Unsigned shift right by immediate and accumulate, 2 long lanes in a
// 128-bit vector: dst = dst + (src >>> shift).
instruct vsrla2L_imm(vecX dst, vecX src, immI shift) %{
  predicate(n->as_Vector()->length() == 2);
  match(Set dst (AddVL dst (URShiftVL src (RShiftCntV shift))));
  ins_cost(INSN_COST);
  format %{ "usra $dst, $src, $shift\t# vector (2D)" %}
  ins_encode %{
    // The shift constant is passed through unmodified (no range handling here).
    const int imm = (int)$shift$$constant;
    __ usra(as_FloatRegister($dst$$reg), __ T2D,
            as_FloatRegister($src$$reg), imm);
  %}
  ins_pipe(vshift128_imm);
%}
19134+
1892519135
instruct vmax2F(vecD dst, vecD src1, vecD src2)
1892619136
%{
1892719137
predicate(n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);

src/hotspot/cpu/aarch64/assembler_aarch64.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2688,6 +2688,8 @@ void mvnw(Register Rd, Register Rm,
26882688
INSN(shl, 0, 0b010101, /* isSHR = */ false);
26892689
INSN(sshr, 0, 0b000001, /* isSHR = */ true);
26902690
INSN(ushr, 1, 0b000001, /* isSHR = */ true);
2691+
// Shift right and accumulate: unsigned (usra) and signed (ssra) variants.
INSN(usra, 1, 0b000101, /* isSHR = */ true);
2692+
INSN(ssra, 0, 0b000101, /* isSHR = */ true);
26912693

26922694
#undef INSN
26932695

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
/*
2+
* Copyright (c) 2020, Huawei Technologies Co. Ltd. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation.
8+
*
9+
* This code is distributed in the hope that it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12+
* version 2 for more details (a copy is included in the LICENSE file that
13+
* accompanied this code).
14+
*
15+
* You should have received a copy of the GNU General Public License version
16+
* 2 along with this work; if not, write to the Free Software Foundation,
17+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18+
*
19+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20+
* or visit www.oracle.com if you need additional information or have any
21+
* questions.
22+
*/
23+
package org.openjdk.bench.vm.compiler;
24+
25+
import org.openjdk.jmh.annotations.*;
26+
import org.openjdk.jmh.infra.*;
27+
28+
import java.util.concurrent.TimeUnit;
29+
import java.util.Random;
30+
31+
@BenchmarkMode(Mode.AverageTime)
32+
@OutputTimeUnit(TimeUnit.NANOSECONDS)
33+
@State(Scope.Thread)
34+
public class VectorShiftAccumulate {
35+
@Param({"1028"})
36+
public int count;
37+
38+
private byte[] bytesA, bytesB, bytesD;
39+
private short[] shortsA, shortsB, shortsD;
40+
private char[] charsA, charsB, charsD;
41+
private int[] intsA, intsB, intsD;
42+
private long[] longsA, longsB, longsD;
43+
44+
@Param("0")
45+
private int seed;
46+
private Random r = new Random(seed);
47+
48+
@Setup
49+
public void init() {
50+
bytesA = new byte[count];
51+
shortsA = new short[count];
52+
charsA = new char[count];
53+
intsA = new int[count];
54+
longsA = new long[count];
55+
56+
bytesB = new byte[count];
57+
shortsB = new short[count];
58+
charsB = new char[count];
59+
intsB = new int[count];
60+
longsB = new long[count];
61+
62+
bytesD = new byte[count];
63+
shortsD = new short[count];
64+
charsD = new char[count];
65+
intsD = new int[count];
66+
longsD = new long[count];
67+
68+
for (int i = 0; i < count; i++) {
69+
bytesA[i] = (byte) r.nextInt();
70+
shortsA[i] = (short) r.nextInt();
71+
intsA[i] = r.nextInt();
72+
longsA[i] = r.nextLong();
73+
74+
bytesB[i] = (byte) r.nextInt();
75+
shortsB[i] = (short) r.nextInt();
76+
intsB[i] = r.nextInt();
77+
longsB[i] = r.nextLong();
78+
}
79+
}
80+
81+
@Benchmark
82+
public void shiftRightAccumulateByte() {
83+
for (int i = 0; i < count; i++) {
84+
bytesD[i] = (byte) (bytesA[i] + (bytesB[i] >> 1));
85+
}
86+
}
87+
88+
@Benchmark
89+
public void shiftURightAccumulateByte() {
90+
for (int i = 0; i < count; i++) {
91+
bytesD[i] = (byte) (bytesA[i] + (((byte) (bytesB[i] >>> 3))));
92+
}
93+
}
94+
95+
@Benchmark
96+
public void shiftRightAccumulateShort() {
97+
for (int i = 0; i < count; i++) {
98+
shortsD[i] = (short) (shortsA[i] + (shortsB[i] >> 5));
99+
}
100+
}
101+
102+
@Benchmark
103+
public void shiftURightAccumulateChar() {
104+
for (int i = 0; i < count; i++) {
105+
charsD[i] = (char) (charsA[i] + (charsB[i] >>> 4));
106+
}
107+
}
108+
109+
@Benchmark
110+
public void shiftRightAccumulateInt() {
111+
for (int i = 0; i < count; i++) {
112+
intsD[i] = intsA[i] + (intsB[i] >> 2);
113+
}
114+
}
115+
116+
@Benchmark
117+
public void shiftURightAccumulateInt() {
118+
for (int i = 0; i < count; i++) {
119+
intsD[i] = (intsB[i] >>> 2) + intsA[i];
120+
}
121+
}
122+
123+
@Benchmark
124+
public void shiftRightAccumulateLong() {
125+
for (int i = 0; i < count; i++) {
126+
longsD[i] = longsA[i] + (longsB[i] >> 5);
127+
}
128+
}
129+
130+
@Benchmark
131+
public void shiftURightAccumulateLong() {
132+
for (int i = 0; i < count; i++) {
133+
longsD[i] = (longsB[i] >>> 2) + longsA[i];
134+
}
135+
}
136+
}
137+

0 commit comments

Comments
 (0)