8297172: Fix some issues of auto-vectorization of `Long.bitCount/numberOfTrailingZeros/numberOfLeadingZeros()`

Reviewed-by: kvn, thartmann
openjdk · Dec 6, 2022 · 4458de9 · 4458de9 · openjdk-notifier · Dec 6, 2022
1 parent a613998
commit 4458de9
Show file tree

Hide file tree

Showing 11 changed files with 172 additions and 142 deletions.
diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad
@@ -132,6 +132,8 @@ source %{
       // Vector API intrinsics.
       if ((opcode == Op_VectorCastD2X && bt == T_INT) ||
           (opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
+          (opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
+          (opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
           opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
           opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
           opcode == Op_MulVL) {
@@ -5672,7 +5674,6 @@ instruct vpopcountI(vReg dst, vReg src) %{
 // vector popcount - LONG
 
 instruct vpopcountL(vReg dst, vReg src) %{
-  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
   match(Set dst (PopCountVL src));
   format %{ "vpopcountL $dst, $src" %}
   ins_encode %{
@@ -5688,32 +5689,6 @@ instruct vpopcountL(vReg dst, vReg src) %{
   ins_pipe(pipe_slow);
 %}
 
-// If the PopCountVL is generated by auto-vectorization, the dst basic
-// type is T_INT. And once we have unified the type definition for
-// Vector API and auto-vectorization, this rule can be merged with
-// "vpopcountL" rule.
-
-instruct vpopcountL_I(vReg dst, vReg src, vReg tmp) %{
-  predicate(Matcher::vector_element_basic_type(n) == T_INT);
-  match(Set dst (PopCountVL src));
-  effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "vpopcountL_I $dst, $src\t# KILL $tmp" %}
-  ins_encode %{
-    if (UseSVE == 0) {
-      __ cnt($dst$$FloatRegister, __ T16B, $src$$FloatRegister);
-      __ uaddlp($dst$$FloatRegister, __ T16B, $dst$$FloatRegister);
-      __ uaddlp($dst$$FloatRegister, __ T8H, $dst$$FloatRegister);
-      __ uaddlp($dst$$FloatRegister, __ T4S, $dst$$FloatRegister);
-      __ xtn($dst$$FloatRegister, __ T2S, $dst$$FloatRegister, __ T2D);
-    } else {
-      __ sve_cnt($dst$$FloatRegister, __ D, ptrue, $src$$FloatRegister);
-      __ sve_vector_narrow($dst$$FloatRegister, __ S,
-                           $dst$$FloatRegister, __ D, $tmp$$FloatRegister);
-    }
-  %}
-  ins_pipe(pipe_slow);
-%}
-
 // vector popcount - predicated
 
 instruct vpopcountI_masked(vReg dst_src, pRegGov pg) %{
@@ -5729,7 +5704,7 @@ instruct vpopcountI_masked(vReg dst_src, pRegGov pg) %{
 %}
 
 instruct vpopcountL_masked(vReg dst_src, pRegGov pg) %{
-  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_LONG);
+  predicate(UseSVE > 0);
   match(Set dst_src (PopCountVL dst_src pg));
   format %{ "vpopcountL_masked $dst_src, $pg, $dst_src" %}
   ins_encode %{

diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@@ -122,6 +122,8 @@ source %{
       // Vector API intrinsics.
       if ((opcode == Op_VectorCastD2X && bt == T_INT) ||
           (opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
+          (opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
+          (opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
           opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
           opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
           opcode == Op_MulVL) {
@@ -4055,7 +4057,6 @@ instruct vpopcountI(vReg dst, vReg src) %{
 // vector popcount - LONG
 
 instruct vpopcountL(vReg dst, vReg src) %{
-  predicate(Matcher::vector_element_basic_type(n) == T_LONG);
   match(Set dst (PopCountVL src));
   format %{ "vpopcountL $dst, $src" %}
   ins_encode %{
@@ -4071,37 +4072,11 @@ instruct vpopcountL(vReg dst, vReg src) %{
   ins_pipe(pipe_slow);
 %}
 
-// If the PopCountVL is generated by auto-vectorization, the dst basic
-// type is T_INT. And once we have unified the type definition for
-// Vector API and auto-vectorization, this rule can be merged with
-// "vpopcountL" rule.
-
-instruct vpopcountL_I(vReg dst, vReg src, vReg tmp) %{
-  predicate(Matcher::vector_element_basic_type(n) == T_INT);
-  match(Set dst (PopCountVL src));
-  effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "vpopcountL_I $dst, $src\t# KILL $tmp" %}
-  ins_encode %{
-    if (UseSVE == 0) {
-      __ cnt($dst$$FloatRegister, __ T16B, $src$$FloatRegister);
-      __ uaddlp($dst$$FloatRegister, __ T16B, $dst$$FloatRegister);
-      __ uaddlp($dst$$FloatRegister, __ T8H, $dst$$FloatRegister);
-      __ uaddlp($dst$$FloatRegister, __ T4S, $dst$$FloatRegister);
-      __ xtn($dst$$FloatRegister, __ T2S, $dst$$FloatRegister, __ T2D);
-    } else {
-      __ sve_cnt($dst$$FloatRegister, __ D, ptrue, $src$$FloatRegister);
-      __ sve_vector_narrow($dst$$FloatRegister, __ S,
-                           $dst$$FloatRegister, __ D, $tmp$$FloatRegister);
-    }
-  %}
-  ins_pipe(pipe_slow);
-%}
-
 // vector popcount - predicated
 UNARY_OP_PREDICATE(vpopcountI, PopCountVI, sve_cnt)
 
 instruct vpopcountL_masked(vReg dst_src, pRegGov pg) %{
-  predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_LONG);
+  predicate(UseSVE > 0);
   match(Set dst_src (PopCountVL dst_src pg));
   format %{ "vpopcountL_masked $dst_src, $pg, $dst_src" %}
   ins_encode %{

diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad
@@ -8875,12 +8875,6 @@ instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
     int vlen_enc = vector_length_encoding(this, $src);
     BasicType bt = Matcher::vector_element_basic_type(this, $src);
     __ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
-    // TODO: Once auto-vectorizer supports ConvL2I operation, PopCountVL
-    // should be succeeded by its corresponding vector IR and following
-    // special handling should be removed.
-    if (opcode == Op_PopCountVL && Matcher::vector_element_basic_type(this) == T_INT) {
-      __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    }
   %}
   ins_pipe( pipe_slow );
 %}
@@ -8911,18 +8905,6 @@ instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %
     BasicType bt = Matcher::vector_element_basic_type(this, $src);
     __ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                                 $xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
-    // TODO: Once auto-vectorizer supports ConvL2I operation, PopCountVL
-    // should be succeeded by its corresponding vector IR and following
-    // special handling should be removed.
-    if (opcode == Op_PopCountVL && Matcher::vector_element_basic_type(this) == T_INT) {
-      if (VM_Version::supports_avx512vl()) {
-        __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-      } else {
-        assert(VM_Version::supports_avx2(), "");
-        __ vpshufd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
-        __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
-      }
-    }
   %}
   ins_pipe( pipe_slow );
 %}
@@ -8939,15 +8921,8 @@ instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp)
   ins_encode %{
     int vlen_enc = vector_length_encoding(this, $src);
     BasicType bt = Matcher::vector_element_basic_type(this, $src);
-    BasicType rbt = Matcher::vector_element_basic_type(this);
     __ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
                                         xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
-    // TODO: Once auto-vectorizer supports ConvL2I operation, CountTrailingZerosV
-    // should be succeeded by its corresponding vector IR and following
-    // special handling should be removed.
-    if (bt == T_LONG && rbt == T_INT) {
-      __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    }
   %}
   ins_pipe( pipe_slow );
 %}
@@ -8993,17 +8968,8 @@ instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, v
   ins_encode %{
     int vlen_enc = vector_length_encoding(this, $src);
     BasicType bt = Matcher::vector_element_basic_type(this, $src);
-    BasicType rbt = Matcher::vector_element_basic_type(this);
     __ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                                        $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
-    // TODO: Once auto-vectorizer supports ConvL2I operation, PopCountVL
-    // should be succeeded by its corresponding vector IR and following
-    // special handling should be removed.
-    if (bt == T_LONG && rbt == T_INT) {
-      assert(VM_Version::supports_avx2(), "");
-      __ vpshufd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
-      __ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
-    }
   %}
   ins_pipe( pipe_slow );
 %}
@@ -9408,15 +9374,8 @@ instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
   ins_encode %{
      int vlen_enc = vector_length_encoding(this, $src);
      BasicType bt = Matcher::vector_element_basic_type(this, $src);
-     BasicType rbt = Matcher::vector_element_basic_type(this);
      __ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
                                         xnoreg, xnoreg, k0, noreg, true, vlen_enc);
-     // TODO: Once auto-vectorizer supports ConvL2I operation, CountLeadingZerosV
-     // should be succeeded by its corresponding vector IR and following
-     // special handling should be removed.
-     if (rbt == T_INT && bt == T_LONG) {
-       __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-     }
   %}
   ins_pipe( pipe_slow );
 %}
@@ -9491,15 +9450,8 @@ instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, ve
   ins_encode %{
     int vlen_enc = vector_length_encoding(this, $src);
     BasicType bt = Matcher::vector_element_basic_type(this, $src);
-    BasicType rbt = Matcher::vector_element_basic_type(this);
     __ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
                                       $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
-    // TODO: Once auto-vectorizer supports ConvL2I operation, CountLeadingZerosV
-    // should be succeeded by its corresponding vector IR and following
-    // special handling should be removed.
-    if (rbt == T_INT && bt == T_LONG) {
-      __ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-    }
   %}
   ins_pipe( pipe_slow );
 %}

diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
@@ -2079,6 +2079,14 @@ bool SuperWord::implemented(Node_List* p) {
     } else if (is_cmove_fp_opcode(opc)) {
       retValue = is_cmov_pack(p) && VectorNode::implemented(opc, size, velt_basic_type(p0));
       NOT_PRODUCT(if(retValue && is_trace_cmov()) {tty->print_cr("SWPointer::implemented: found cmove pack"); print_pack(p);})
+    } else if (requires_long_to_int_conversion(opc)) {
+      // Java API for Long.bitCount/numberOfLeadingZeros/numberOfTrailingZeros
+      // returns int type, but Vector API for them returns long type. To unify
+      // the implementation in backend, superword splits the vector implementation
+      // for Java API into an execution node with long type plus another node
+      // converting long to int.
+      retValue = VectorNode::implemented(opc, size, T_LONG) &&
+                 VectorCastNode::implemented(Op_ConvL2I, size, T_LONG, T_INT);
     } else {
       // Vector unsigned right shift for signed subword types behaves differently
       // from Java Spec. But when the shift amount is a constant not greater than
@@ -2096,6 +2104,18 @@ bool SuperWord::implemented(Node_List* p) {
 bool SuperWord::is_cmov_pack(Node_List* p) {
   return _cmovev_kit.pack(p->at(0)) != NULL;
 }
+
+bool SuperWord::requires_long_to_int_conversion(int opc) {  // Is `opc` a scalar op that takes a long but produces an int?
+  switch(opc) {
+    case Op_PopCountL:  // all three cases below share the same answer
+    case Op_CountLeadingZerosL:
+    case Op_CountTrailingZerosL:
+      return true;  // superword emits a T_LONG vector op plus a long-to-int vector cast for these
+    default:
+      return false;  // no long-to-int split needed
+  }
+}
+
 //------------------------------same_inputs--------------------------
 // For pack p, are all idx operands the same?
 bool SuperWord::same_inputs(Node_List* p, int idx) {
@@ -2666,16 +2686,28 @@ bool SuperWord::output() {
                  opc == Op_AbsI || opc == Op_AbsL ||
                  opc == Op_NegF || opc == Op_NegD ||
                  opc == Op_RoundF || opc == Op_RoundD ||
-                 opc == Op_PopCountI || opc == Op_PopCountL ||
                  opc == Op_ReverseBytesI || opc == Op_ReverseBytesL ||
                  opc == Op_ReverseBytesUS || opc == Op_ReverseBytesS ||
                  opc == Op_ReverseI || opc == Op_ReverseL ||
-                 opc == Op_CountLeadingZerosI || opc == Op_CountLeadingZerosL ||
-                 opc == Op_CountTrailingZerosI || opc == Op_CountTrailingZerosL) {
+                 opc == Op_PopCountI || opc == Op_CountLeadingZerosI ||
+                 opc == Op_CountTrailingZerosI) {
         assert(n->req() == 2, "only one input expected");
         Node* in = vector_opd(p, 1);
         vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
         vlen_in_bytes = vn->as_Vector()->length_in_bytes();
+      } else if (requires_long_to_int_conversion(opc)) {
+        // Java API for Long.bitCount/numberOfLeadingZeros/numberOfTrailingZeros
+        // returns int type, but Vector API for them returns long type. To unify
+        // the implementation in backend, superword splits the vector implementation
+        // for Java API into an execution node with long type plus another node
+        // converting long to int.
+        assert(n->req() == 2, "only one input expected");
+        Node* in = vector_opd(p, 1);
+        Node* longval = VectorNode::make(opc, in, NULL, vlen, T_LONG);
+        _igvn.register_new_node_with_optimizer(longval);
+        _phase->set_ctrl(longval, _phase->get_ctrl(p->at(0)));
+        vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen);
+        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
       } else if (VectorNode::is_convert_opcode(opc)) {
         assert(n->req() == 2, "only one input expected");
         BasicType bt = velt_basic_type(n);
@@ -3198,27 +3230,11 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) {
     return true;
   }
 
-  if (VectorNode::is_type_transition_long_to_int(use)) {
-    // PopCountL/CountLeadingZerosL/CountTrailingZerosL takes long and produces
-    // int - hence the special checks on alignment and size.
-    if (u_pk->size() != d_pk->size()) {
-      return false;
-    }
-    for (uint i = 0; i < MIN2(d_pk->size(), u_pk->size()); i++) {
-      Node* ui = u_pk->at(i);
-      Node* di = d_pk->at(i);
-      if (alignment(ui) * 2 != alignment(di)) {
-        return false;
-      }
-    }
-    return true;
-  }
-
   if (u_pk->size() != d_pk->size())
     return false;
 
   if (longer_type_for_conversion(use) != T_ILLEGAL) {
-    // type conversion takes a type of a kind of size and produces a type of
+    // These opcodes take a type of one size and produce a type of
     // another size - hence the special checks on alignment and size.
     for (uint i = 0; i < u_pk->size(); i++) {
       Node* ui = u_pk->at(i);
@@ -3467,7 +3483,8 @@ void SuperWord::compute_max_depth() {
 }
 
 BasicType SuperWord::longer_type_for_conversion(Node* n) {
-  if (!VectorNode::is_convert_opcode(n->Opcode()) ||
+  if (!(VectorNode::is_convert_opcode(n->Opcode()) ||
+        requires_long_to_int_conversion(n->Opcode())) ||
       !in_bb(n->in(1))) {
     return T_ILLEGAL;
   }

diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp
@@ -457,6 +457,7 @@ class SuperWord : public ResourceObj {
   bool is_cmov_pack(Node_List* p);
   bool is_cmov_pack_internal_node(Node_List* p, Node* nd) { return is_cmov_pack(p) && !nd->is_CMove(); }
   static bool is_cmove_fp_opcode(int opc) { return (opc == Op_CMoveF || opc == Op_CMoveD); }
+  static bool requires_long_to_int_conversion(int opc);
   // For pack p, are all idx operands the same?
   bool same_inputs(Node_List* p, int idx);
   // CloneMap utilities

diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp
@@ -343,17 +343,6 @@ bool VectorNode::is_muladds2i(Node* n) {
   return false;
 }
 
-bool VectorNode::is_type_transition_long_to_int(Node* n) {
-  switch(n->Opcode()) {
-    case Op_PopCountL:
-    case Op_CountLeadingZerosL:
-    case Op_CountTrailingZerosL:
-       return true;
-    default:
-       return false;
-  }
-}
-
 bool VectorNode::is_roundopD(Node* n) {
   if (n->Opcode() == Op_RoundDoubleMode) {
     return true;

diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp
@@ -99,7 +99,6 @@ class VectorNode : public TypeNode {
   static bool is_type_transition_short_to_int(Node* n);
   static bool is_type_transition_to_int(Node* n);
   static bool is_muladds2i(Node* n);
-  static bool is_type_transition_long_to_int(Node* n);
   static bool is_roundopD(Node* n);
   static bool is_scalar_rotate(Node* n);
   static bool is_vector_rotate_supported(int opc, uint vlen, BasicType bt);
@@ -551,7 +550,9 @@ class PopCountVINode : public VectorNode {
 // Vector popcount long bits
 class PopCountVLNode : public VectorNode {
  public:
-  PopCountVLNode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {}
+  PopCountVLNode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {  // vt is the vector type of the result
+    assert(vt->element_basic_type() == T_LONG, "must be long");  // rejects any non-long element type for the result vector
+  }
   virtual int Opcode() const;
 };
 
@@ -1732,15 +1733,21 @@ class RotateLeftVNode : public VectorNode {
 class CountLeadingZerosVNode : public VectorNode {
  public:
   CountLeadingZerosVNode(Node* in, const TypeVect* vt)
-  : VectorNode(in, vt) {}
+  : VectorNode(in, vt) {  // in: vector operand, vt: vector type of the result
+    assert(in->bottom_type()->is_vect()->element_basic_type() == vt->element_basic_type(),
+           "must be the same");  // operand and result vectors must share one element type
+  }
 
   virtual int Opcode() const;
 };
 
 class CountTrailingZerosVNode : public VectorNode {
  public:
   CountTrailingZerosVNode(Node* in, const TypeVect* vt)
-  : VectorNode(in, vt) {}
+  : VectorNode(in, vt) {  // in: vector operand, vt: vector type of the result
+    assert(in->bottom_type()->is_vect()->element_basic_type() == vt->element_basic_type(),
+           "must be the same");  // operand and result vectors must share one element type
+  }
 
   virtual int Opcode() const;
 };