8295276: AArch64: Add backend support for half float conversion intrinsics

This patch adds AArch64 backend support for library intrinsics that implement conversions between half-precision and single-precision floats. The following benchmarks were run to assess the performance with this patch:

  org.openjdk.bench.java.math.Fp16ConversionBenchmark.floatToFloat16
  org.openjdk.bench.java.math.Fp16ConversionBenchmark.float16ToFloat

The performance (ops/ms) gain with the patch on an ARM NEON machine is shown below:

  Benchmark                                Gain
  Fp16ConversionBenchmark.float16ToFloat   3.42
  Fp16ConversionBenchmark.floatToFloat16   5.85
openjdk · Oct 20, 2022 · 121f543 · 121f543
1 parent 9b97162
commit 121f543
Show file tree

Hide file tree

Showing 4 changed files with 658 additions and 624 deletions.
diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -14578,6 +14578,32 @@ instruct convF2L_reg_reg(iRegLNoSp dst, vRegF src) %{
   ins_pipe(fp_f2l);
 %}
 
+instruct convF2HF_reg_reg(iRegINoSp dst, vRegF src, vRegF tmp) %{
+  match(Set dst (ConvF2HF src));
+  format %{ "fcvt $tmp, $src\t# convert single to half precision\n\t"
+            "smov $dst, $tmp\t# move result from $tmp to $dst"
+  %}
+  effect(TEMP tmp);
+  ins_encode %{
+      __ fcvtsh($tmp$$FloatRegister, $src$$FloatRegister);
+      __ smov($dst$$Register, $tmp$$FloatRegister, __ H, 0);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct convHF2F_reg_reg(vRegF dst, iRegINoSp src, vRegF tmp) %{
+  match(Set dst (ConvHF2F src));
+  format %{ "mov $tmp, $src\t# move source from $src to $tmp\n\t"
+            "fcvt $dst, $tmp\t# convert half to single precision"
+  %}
+  effect(TEMP tmp);
+  ins_encode %{
+      __ mov($tmp$$FloatRegister, __ H, 0, $src$$Register);
+      __ fcvths($dst$$FloatRegister, $tmp$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
 instruct convI2F_reg_reg(vRegF dst, iRegIorL2I src) %{
   match(Set dst (ConvI2F src));
 

diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@@ -1907,31 +1907,33 @@ void mvnw(Register Rd, Register Rm,
 #undef INSN
 
   // Floating-point data-processing (1 source)
-  void data_processing(unsigned op31, unsigned type, unsigned opcode,
+  void data_processing(unsigned type, unsigned opcode,
                        FloatRegister Vd, FloatRegister Vn) {
     starti;
-    f(op31, 31, 29);
+    f(0b000, 31, 29);
     f(0b11110, 28, 24);
     f(type, 23, 22), f(1, 21), f(opcode, 20, 15), f(0b10000, 14, 10);
     rf(Vn, 5), rf(Vd, 0);
   }
 
-#define INSN(NAME, op31, type, opcode)                  \
+#define INSN(NAME, type, opcode)                        \
   void NAME(FloatRegister Vd, FloatRegister Vn) {       \
-    data_processing(op31, type, opcode, Vd, Vn);        \
-  }
-
-  INSN(fmovs,  0b000, 0b00, 0b000000);
-  INSN(fabss,  0b000, 0b00, 0b000001);
-  INSN(fnegs,  0b000, 0b00, 0b000010);
-  INSN(fsqrts, 0b000, 0b00, 0b000011);
-  INSN(fcvts,  0b000, 0b00, 0b000101);   // Single-precision to double-precision
-
-  INSN(fmovd,  0b000, 0b01, 0b000000);
-  INSN(fabsd,  0b000, 0b01, 0b000001);
-  INSN(fnegd,  0b000, 0b01, 0b000010);
-  INSN(fsqrtd, 0b000, 0b01, 0b000011);
-  INSN(fcvtd,  0b000, 0b01, 0b000100);   // Double-precision to single-precision
+    data_processing(type, opcode, Vd, Vn);              \
+  }
+
+  INSN(fmovs,  0b00, 0b000000);
+  INSN(fabss,  0b00, 0b000001);
+  INSN(fnegs,  0b00, 0b000010);
+  INSN(fsqrts, 0b00, 0b000011);
+  INSN(fcvts,  0b00, 0b000101);   // Single-precision to double-precision
+  INSN(fcvths, 0b11, 0b000100);   // Half-precision to single-precision
+  INSN(fcvtsh, 0b00, 0b000111);   // Single-precision to half-precision
+
+  INSN(fmovd,  0b01, 0b000000);
+  INSN(fabsd,  0b01, 0b000001);
+  INSN(fnegd,  0b01, 0b000010);
+  INSN(fsqrtd, 0b01, 0b000011);
+  INSN(fcvtd,  0b01, 0b000100);   // Double-precision to single-precision
 
 private:
   void _fcvt_narrow_extend(FloatRegister Vd, SIMD_Arrangement Ta,

diff --git a/test/hotspot/gtest/aarch64/aarch64-asmtest.py b/test/hotspot/gtest/aarch64/aarch64-asmtest.py
@@ -957,7 +957,9 @@ def cstr(self):
 class FloatInstruction(Instruction):
 
     def aname(self):
-        if (self._name.endswith("s") | self._name.endswith("d")):
+        if (self._name in ["fcvtsh", "fcvths"]):
+            return self._name[:len(self._name)-2]
+        elif (self._name.endswith("s") | self._name.endswith("d")):
             return self._name[:len(self._name)-1]
         else:
             return self._name
@@ -1012,6 +1014,8 @@ def __init__(self, args):
         elif not self._isPredicated and (name in ["and", "eor", "orr", "bic"]):
             self._width = RegVariant(3, 3)
             self._bitwiseop = True
+        elif name == "revb":
+            self._width = RegVariant(1, 3)
         else:
             self._width = RegVariant(0, 3)
 
@@ -1458,7 +1462,7 @@ def generate(kind, names):
 
 generate(TwoRegFloatOp,
          [["fmovs", "ss"], ["fabss", "ss"], ["fnegs", "ss"], ["fsqrts", "ss"],
-          ["fcvts", "ds"],
+          ["fcvts", "ds"], ["fcvtsh", "hs"], ["fcvths", "sh"],
           ["fmovd", "dd"], ["fabsd", "dd"], ["fnegd", "dd"], ["fsqrtd", "dd"],
           ["fcvtd", "sd"],
           ])