8239549: AArch64: Backend support for MulAddVS2VI node

Reviewed-by: aph
openjdk · Feb 26, 2020 · 934db29 · 934db29
1 parent e6f0c6b
commit 934db29
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 10 deletions.
diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -2074,15 +2074,24 @@ const bool Matcher::match_rule_supported(int opcode) {
   return ret_value; // Per default match rules are supported.
 }
 
+// Identify extra cases that we might want to provide match rules for vector nodes and
+// other intrinsics guarded with vector length (vlen) and element type (bt).
 const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
+  if (!match_rule_supported(opcode)) {
+    return false;
+  }
 
-  // TODO
-  // identify extra cases that we might want to provide match rules for
-  // e.g. Op_ vector nodes and other intrinsics while guarding with vlen
-  bool ret_value = match_rule_supported(opcode);
-  // Add rules here.
+  // Special cases which require vector length
+  switch (opcode) {
+    case Op_MulAddVS2VI: {
+      if (vlen != 4) {
+        return false;
+      }
+      break;
+    }
+  }
 
-  return ret_value;  // Per default match rules are supported.
+  return true; // Per default match rules are supported.
 }
 
 const bool Matcher::has_predicated_vectors(void) {
@@ -10555,6 +10564,22 @@ instruct smnegL(iRegLNoSp dst, iRegIorL2I src1, iRegIorL2I src2, immL0 zero) %{
   ins_pipe(imac_reg_reg);
 %}
 
+// Combined Multiply-Add Shorts into Integer (dst = src1 * src2 + src3 * src4)
+
+instruct muladdS2I(iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2, iRegIorL2I src3, iRegIorL2I src4) %{
+  match(Set dst (MulAddS2I (Binary src1 src2) (Binary src3 src4)));
+
+  ins_cost(INSN_COST * 5);
+  format %{ "mulw  rscratch1, $src1, $src2\n\t"
+            "maddw $dst, $src3, $src4, rscratch1" %}
+
+  ins_encode %{
+    __ mulw(rscratch1, as_Register($src1$$reg), as_Register($src2$$reg));
+    __ maddw(as_Register($dst$$reg), as_Register($src3$$reg), as_Register($src4$$reg), rscratch1); %}
+
+  ins_pipe(imac_reg_reg);
+%}
+
 // Integer Divide
 
 instruct divI(iRegINoSp dst, iRegIorL2I src1, iRegIorL2I src2) %{
@@ -16935,6 +16960,30 @@ instruct vmls2D(vecX dst, vecX src1, vecX src2) %{
   ins_pipe(vmuldiv_fp128);
 %}
 
+// --------------- Vector Multiply-Add Shorts into Integer --------------------
+
+instruct vmuladdS2I(vecX dst, vecX src1, vecX src2, vecX tmp) %{
+  predicate(n->in(1)->bottom_type()->is_vect()->element_basic_type() == T_SHORT);
+  match(Set dst (MulAddVS2VI src1 src2));
+  ins_cost(INSN_COST);
+  effect(TEMP tmp);
+  format %{ "smullv  $tmp, $src1, $src2\t# vector (4H)\n\t"
+            "smullv  $dst, $src1, $src2\t# vector (8H)\n\t"
+            "addpv   $dst, $tmp, $dst\t# vector (4S)\n\t" %}
+  ins_encode %{
+    __ smullv(as_FloatRegister($tmp$$reg), __ T4H,
+              as_FloatRegister($src1$$reg),
+              as_FloatRegister($src2$$reg));
+    __ smullv(as_FloatRegister($dst$$reg), __ T8H,
+              as_FloatRegister($src1$$reg),
+              as_FloatRegister($src2$$reg));
+    __ addpv(as_FloatRegister($dst$$reg), __ T4S,
+             as_FloatRegister($tmp$$reg),
+             as_FloatRegister($dst$$reg));
+  %}
+  ins_pipe(vmuldiv_fp128);
+%}
+
 // --------------------------------- DIV --------------------------------------
 
 instruct vdiv2F(vecD dst, vecD src1, vecD src2)

diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@@ -2259,6 +2259,8 @@ void mvnw(Register Rd, Register Rm,
   INSN(mlsv,   1, 0b100101, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
   INSN(sshl,   0, 0b010001, true);  // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
   INSN(ushl,   1, 0b010001, true);  // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
+  INSN(addpv,  0, 0b101111, true);  // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
+  INSN(smullv, 0, 0b110000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
   INSN(umullv, 1, 0b110000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
   INSN(umlalv, 1, 0b100000, false); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
 

diff --git a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2015, 2019, Red Hat Inc. All rights reserved.
+ * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2020, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -451,6 +451,10 @@ void VM_Version::get_processor_features() {
   if (FLAG_IS_DEFAULT(OptoScheduling)) {
     OptoScheduling = true;
   }
+
+  if (FLAG_IS_DEFAULT(AlignVector)) {
+    AlignVector = AvoidUnalignedAccesses;
+  }
 #endif
 }
 

diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/Vec_MulAddS2I.java b/test/hotspot/jtreg/compiler/loopopts/superword/Vec_MulAddS2I.java
@@ -24,8 +24,7 @@
 /**
  * @test
  * @bug 8214751
- * @summary Add C2 x86 Superword support for VNNI VPDPWSSD Instruction
- * @requires os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64"
+ * @summary Test operations in C2 MulAddS2I and MulAddVS2VI nodes.
  *
  * @run main/othervm -XX:LoopUnrollLimit=250
  *      -XX:CompileThresholdScaling=0.1