8255949: AArch64: Add support for vectorized shift right and accumulate

Dong Bo · RealFYang · commit f71f9dc93a69 · 2020-11-10T01:24:25.000Z
Reviewed-by: aph
diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -18922,6 +18922,216 @@ instruct vsrl2L_imm(vecX dst, vecX src, immI shift) %{
   ins_pipe(vshift128_imm);
 %}
 
+// dst += (src >> shift) on 8 byte lanes via ssra; a count of 8 or more is emitted as 7.
+instruct vsraa8B_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (AddVB dst (RShiftVB src (RShiftCntV shift))));
+  ins_cost(INSN_COST);
+  format %{ "ssra    $dst, $src, $shift\t# vector (8B)" %}
+  ins_encode %{
+    const int count = (int)$shift$$constant;
+    __ ssra(as_FloatRegister($dst$$reg), __ T8B,
+            as_FloatRegister($src$$reg), (count >= 8) ? 7 : count);
+  %}
+  ins_pipe(vshift64_imm);
+%}
+
+// dst += (src >> shift) on 16 byte lanes via ssra; a count of 8 or more is emitted as 7.
+instruct vsraa16B_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (AddVB dst (RShiftVB src (RShiftCntV shift))));
+  ins_cost(INSN_COST);
+  format %{ "ssra    $dst, $src, $shift\t# vector (16B)" %}
+  ins_encode %{
+    const int count = (int)$shift$$constant;
+    __ ssra(as_FloatRegister($dst$$reg), __ T16B,
+            as_FloatRegister($src$$reg), (count >= 8) ? 7 : count);
+  %}
+  ins_pipe(vshift128_imm);
+%}
+
+// dst += (src >> shift) on 4 short lanes via ssra; a count of 16 or more is emitted as 15.
+instruct vsraa4S_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVS dst (RShiftVS src (RShiftCntV shift))));
+  ins_cost(INSN_COST);
+  format %{ "ssra    $dst, $src, $shift\t# vector (4H)" %}
+  ins_encode %{
+    const int count = (int)$shift$$constant;
+    __ ssra(as_FloatRegister($dst$$reg), __ T4H,
+            as_FloatRegister($src$$reg), (count >= 16) ? 15 : count);
+  %}
+  ins_pipe(vshift64_imm);
+%}
+
+// dst += (src >> shift) on 8 short lanes via ssra; a count of 16 or more is emitted as 15.
+instruct vsraa8S_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (AddVS dst (RShiftVS src (RShiftCntV shift))));
+  ins_cost(INSN_COST);
+  format %{ "ssra    $dst, $src, $shift\t# vector (8H)" %}
+  ins_encode %{
+    const int count = (int)$shift$$constant;
+    __ ssra(as_FloatRegister($dst$$reg), __ T8H,
+            as_FloatRegister($src$$reg), (count >= 16) ? 15 : count);
+  %}
+  ins_pipe(vshift128_imm);
+%}
+
+// dst += (src >> shift) on 2 int lanes via ssra; the count is passed through as is.
+instruct vsraa2I_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVI dst (RShiftVI src (RShiftCntV shift))));
+  ins_cost(INSN_COST);
+  format %{ "ssra    $dst, $src, $shift\t# vector (2S)" %}
+  ins_encode %{
+    const int count = (int)$shift$$constant;
+    __ ssra(as_FloatRegister($dst$$reg), __ T2S, as_FloatRegister($src$$reg), count);
+  %}
+  ins_pipe(vshift64_imm);
+%}
+
+// dst += (src >> shift) on 4 int lanes via ssra; the count is passed through as is.
+instruct vsraa4I_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVI dst (RShiftVI src (RShiftCntV shift))));
+  ins_cost(INSN_COST);
+  format %{ "ssra    $dst, $src, $shift\t# vector (4S)" %}
+  ins_encode %{
+    const int count = (int)$shift$$constant;
+    __ ssra(as_FloatRegister($dst$$reg), __ T4S, as_FloatRegister($src$$reg), count);
+  %}
+  ins_pipe(vshift128_imm);
+%}
+
+// dst += (src >> shift) on 2 long lanes via ssra; the count is passed through as is.
+instruct vsraa2L_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVL dst (RShiftVL src (RShiftCntV shift))));
+  ins_cost(INSN_COST);
+  format %{ "ssra    $dst, $src, $shift\t# vector (2D)" %}
+  ins_encode %{
+    const int count = (int)$shift$$constant;
+    __ ssra(as_FloatRegister($dst$$reg), __ T2D, as_FloatRegister($src$$reg), count);
+  %}
+  ins_pipe(vshift128_imm);
+%}
+
+// Unsigned shift right and accumulate on 8 byte lanes: dst += (src >>> shift).
+instruct vsrla8B_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (AddVB dst (URShiftVB src (RShiftCntV shift))));
+  ins_cost(INSN_COST);
+  format %{ "usra    $dst, $src, $shift\t# vector (8B)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant;
+    // A logical shift by 8 or more zeroes every lane, so dst + 0 == dst:
+    // emit nothing. src is an input-only operand (no effect() is declared)
+    // and therefore must not be written here.
+    if (sh < 8) {
+      __ usra(as_FloatRegister($dst$$reg), __ T8B,
+             as_FloatRegister($src$$reg), sh);
+    }
+  %}
+  ins_pipe(vshift64_imm);
+%}
+
+// Unsigned shift right and accumulate on 16 byte lanes: dst += (src >>> shift).
+instruct vsrla16B_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 16);
+  match(Set dst (AddVB dst (URShiftVB src (RShiftCntV shift))));
+  ins_cost(INSN_COST);
+  format %{ "usra    $dst, $src, $shift\t# vector (16B)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant;
+    // A logical shift by 8 or more zeroes every lane, so dst + 0 == dst:
+    // emit nothing. src is an input-only operand (no effect() is declared)
+    // and therefore must not be written here.
+    if (sh < 8) {
+      __ usra(as_FloatRegister($dst$$reg), __ T16B,
+             as_FloatRegister($src$$reg), sh);
+    }
+  %}
+  ins_pipe(vshift128_imm);
+%}
+
+// Unsigned shift right and accumulate on 4 short lanes: dst += (src >>> shift).
+instruct vsrla4S_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVS dst (URShiftVS src (RShiftCntV shift))));
+  ins_cost(INSN_COST);
+  format %{ "usra    $dst, $src, $shift\t# vector (4H)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant;
+    // A logical shift by 16 or more zeroes every lane, so dst + 0 == dst:
+    // emit nothing and leave the input-only src operand untouched.
+    // Otherwise accumulate with usra; ushr would overwrite dst instead.
+    if (sh < 16) {
+      __ usra(as_FloatRegister($dst$$reg), __ T4H,
+             as_FloatRegister($src$$reg), sh);
+    }
+  %}
+  ins_pipe(vshift64_imm);
+%}
+
+// Unsigned shift right and accumulate on 8 short lanes: dst += (src >>> shift).
+instruct vsrla8S_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 8);
+  match(Set dst (AddVS dst (URShiftVS src (RShiftCntV shift))));
+  ins_cost(INSN_COST);
+  format %{ "usra    $dst, $src, $shift\t# vector (8H)" %}
+  ins_encode %{
+    int sh = (int)$shift$$constant;
+    // A logical shift by 16 or more zeroes every lane, so dst + 0 == dst:
+    // emit nothing. src is an input-only operand (no effect() is declared)
+    // and therefore must not be written here.
+    if (sh < 16) {
+      __ usra(as_FloatRegister($dst$$reg), __ T8H,
+             as_FloatRegister($src$$reg), sh);
+    }
+  %}
+  ins_pipe(vshift128_imm);
+%}
+
+// dst += (src >>> shift) on 2 int lanes via usra; the count is passed through as is.
+instruct vsrla2I_imm(vecD dst, vecD src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVI dst (URShiftVI src (RShiftCntV shift))));
+  ins_cost(INSN_COST);
+  format %{ "usra    $dst, $src, $shift\t# vector (2S)" %}
+  ins_encode %{
+    const int count = (int)$shift$$constant;
+    __ usra(as_FloatRegister($dst$$reg), __ T2S, as_FloatRegister($src$$reg), count);
+  %}
+  ins_pipe(vshift64_imm);
+%}
+
+// dst += (src >>> shift) on 4 int lanes via usra; the count is passed through as is.
+instruct vsrla4I_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 4);
+  match(Set dst (AddVI dst (URShiftVI src (RShiftCntV shift))));
+  ins_cost(INSN_COST);
+  format %{ "usra    $dst, $src, $shift\t# vector (4S)" %}
+  ins_encode %{
+    const int count = (int)$shift$$constant;
+    __ usra(as_FloatRegister($dst$$reg), __ T4S, as_FloatRegister($src$$reg), count);
+  %}
+  ins_pipe(vshift128_imm);
+%}
+
+// dst += (src >>> shift) on 2 long lanes via usra; the count is passed through as is.
+instruct vsrla2L_imm(vecX dst, vecX src, immI shift) %{
+  predicate(n->as_Vector()->length() == 2);
+  match(Set dst (AddVL dst (URShiftVL src (RShiftCntV shift))));
+  ins_cost(INSN_COST);
+  format %{ "usra    $dst, $src, $shift\t# vector (2D)" %}
+  ins_encode %{
+    const int count = (int)$shift$$constant;
+    __ usra(as_FloatRegister($dst$$reg), __ T2D, as_FloatRegister($src$$reg), count);
+  %}
+  ins_pipe(vshift128_imm);
+%}
+
 instruct vmax2F(vecD dst, vecD src1, vecD src2)
 %{
   predicate(n->as_Vector()->length() == 2 && n->bottom_type()->is_vect()->element_basic_type() == T_FLOAT);
diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@@ -2688,6 +2688,8 @@ void mvnw(Register Rd, Register Rm,
   INSN(shl,  0, 0b010101, /* isSHR = */ false);
   INSN(sshr, 0, 0b000001, /* isSHR = */ true);
   INSN(ushr, 1, 0b000001, /* isSHR = */ true);
+  INSN(usra, 1, 0b000101, /* isSHR = */ true);
+  INSN(ssra, 0, 0b000101, /* isSHR = */ true);
 
 #undef INSN
 
diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorShiftAccumulate.java b/test/micro/org/openjdk/bench/vm/compiler/VectorShiftAccumulate.java
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2020, Huawei Technologies Co. Ltd. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.vm.compiler;
+
+import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.*;
+
+import java.util.concurrent.TimeUnit;
+import java.util.Random;
+
+/**
+ * Microbenchmarks for loops of the form {@code d[i] = a[i] + (b[i] >> k)} and
+ * {@code d[i] = a[i] + (b[i] >>> k)} with a constant shift count {@code k},
+ * over byte, short, char, int and long arrays.
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+public class VectorShiftAccumulate {
+    /** Number of elements in every array. */
+    @Param({"1028"})
+    public int count;
+
+    // For each element type: A and B are the inputs, D receives the result.
+    private byte[]  bytesA,  bytesB,  bytesD;
+    private short[] shortsA, shortsB, shortsD;
+    private char[]  charsA,  charsB,  charsD;
+    private int[]   intsA,   intsB,   intsD;
+    private long[]  longsA,  longsB,  longsD;
+
+    /** Seed for the pseudo-random input data. */
+    @Param("0")
+    private int seed;
+    private Random r;
+
+    /**
+     * Allocates all arrays and fills every A and B input array with
+     * pseudo-random values derived from {@link #seed}.
+     */
+    @Setup
+    public void init() {
+        // Create the generator here rather than in a field initializer: JMH
+        // assigns @Param fields after the state object is constructed, so an
+        // initializer would always see the default value of seed.
+        r = new Random(seed);
+
+        bytesA  = new byte[count];
+        shortsA = new short[count];
+        charsA  = new char[count];
+        intsA   = new int[count];
+        longsA  = new long[count];
+
+        bytesB  = new byte[count];
+        shortsB = new short[count];
+        charsB  = new char[count];
+        intsB   = new int[count];
+        longsB  = new long[count];
+
+        bytesD  = new byte[count];
+        shortsD = new short[count];
+        charsD  = new char[count];
+        intsD   = new int[count];
+        longsD  = new long[count];
+
+        for (int i = 0; i < count; i++) {
+            bytesA[i]  = (byte) r.nextInt();
+            shortsA[i] = (short) r.nextInt();
+            // The char inputs must be populated too; otherwise the char
+            // benchmark would only ever operate on zeros.
+            charsA[i]  = (char) r.nextInt();
+            intsA[i]   = r.nextInt();
+            longsA[i]  = r.nextLong();
+
+            bytesB[i]  = (byte) r.nextInt();
+            shortsB[i] = (short) r.nextInt();
+            charsB[i]  = (char) r.nextInt();
+            intsB[i]   = r.nextInt();
+            longsB[i]  = r.nextLong();
+        }
+    }
+
+    /** bytesD[i] = bytesA[i] + (bytesB[i] >> 1). */
+    @Benchmark
+    public void shiftRightAccumulateByte() {
+        for (int i = 0; i < count; i++) {
+            bytesD[i] = (byte) (bytesA[i] + (bytesB[i] >> 1));
+        }
+    }
+
+    /** bytesD[i] = bytesA[i] + (byte) (bytesB[i] >>> 3). */
+    @Benchmark
+    public void shiftURightAccumulateByte() {
+        for (int i = 0; i < count; i++) {
+            bytesD[i] = (byte) (bytesA[i] + (((byte) (bytesB[i] >>> 3))));
+        }
+    }
+
+    /** shortsD[i] = shortsA[i] + (shortsB[i] >> 5). */
+    @Benchmark
+    public void shiftRightAccumulateShort() {
+        for (int i = 0; i < count; i++) {
+            shortsD[i] = (short) (shortsA[i] + (shortsB[i] >> 5));
+        }
+    }
+
+    /** charsD[i] = charsA[i] + (charsB[i] >>> 4). */
+    @Benchmark
+    public void shiftURightAccumulateChar() {
+        for (int i = 0; i < count; i++) {
+            charsD[i] = (char) (charsA[i] + (charsB[i] >>> 4));
+        }
+    }
+
+    /** intsD[i] = intsA[i] + (intsB[i] >> 2). */
+    @Benchmark
+    public void shiftRightAccumulateInt() {
+        for (int i = 0; i < count; i++) {
+            intsD[i] = intsA[i] + (intsB[i] >> 2);
+        }
+    }
+
+    /** intsD[i] = (intsB[i] >>> 2) + intsA[i]; the shift is the left addend here. */
+    @Benchmark
+    public void shiftURightAccumulateInt() {
+        for (int i = 0; i < count; i++) {
+            intsD[i] = (intsB[i] >>> 2) + intsA[i];
+        }
+    }
+
+    /** longsD[i] = longsA[i] + (longsB[i] >> 5). */
+    @Benchmark
+    public void shiftRightAccumulateLong() {
+        for (int i = 0; i < count; i++) {
+            longsD[i] = longsA[i] + (longsB[i] >> 5);
+        }
+    }
+
+    /** longsD[i] = (longsB[i] >>> 2) + longsA[i]; the shift is the left addend here. */
+    @Benchmark
+    public void shiftURightAccumulateLong() {
+        for (int i = 0; i < count; i++) {
+            longsD[i] = (longsB[i] >>> 2) + longsA[i];
+        }
+    }
+}
+