Skip to content

Commit

Permalink
8297172: Fix some issues of auto-vectorization of `Long.bitCount/numb…
Browse files Browse the repository at this point in the history
…erOfTrailingZeros/numberOfLeadingZeros()`

Reviewed-by: kvn, thartmann
  • Loading branch information
Fei Gao authored and Pengfei Li committed Dec 6, 2022
1 parent a613998 commit 4458de9
Show file tree
Hide file tree
Showing 11 changed files with 172 additions and 142 deletions.
31 changes: 3 additions & 28 deletions src/hotspot/cpu/aarch64/aarch64_vector.ad
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ source %{
// Vector API intrinsics.
if ((opcode == Op_VectorCastD2X && bt == T_INT) ||
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
opcode == Op_MulVL) {
Expand Down Expand Up @@ -5672,7 +5674,6 @@ instruct vpopcountI(vReg dst, vReg src) %{
// vector popcount - LONG

instruct vpopcountL(vReg dst, vReg src) %{
predicate(Matcher::vector_element_basic_type(n) == T_LONG);
match(Set dst (PopCountVL src));
format %{ "vpopcountL $dst, $src" %}
ins_encode %{
Expand All @@ -5688,32 +5689,6 @@ instruct vpopcountL(vReg dst, vReg src) %{
ins_pipe(pipe_slow);
%}

// If the PopCountVL is generated by auto-vectorization, the dst basic
// type is T_INT. And once we have unified the type definition for
// Vector API and auto-vectorization, this rule can be merged with
// "vpopcountL" rule.

// Matches PopCountVL when the result vector element type is T_INT (see the
// predicate), i.e. the per-lane 64-bit population counts must end up as
// 32-bit lanes. dst is declared TEMP_DEF and tmp is an additional temporary
// vector register (only the SVE path below references tmp).
instruct vpopcountL_I(vReg dst, vReg src, vReg tmp) %{
predicate(Matcher::vector_element_basic_type(n) == T_INT);
match(Set dst (PopCountVL src));
effect(TEMP_DEF dst, TEMP tmp);
format %{ "vpopcountL_I $dst, $src\t# KILL $tmp" %}
ins_encode %{
if (UseSVE == 0) {
// NEON path: count bits per byte, then run three pairwise widening adds
// (T16B, T8H, T4S arrangements) to accumulate the byte counts into
// 64-bit lanes.
__ cnt($dst$$FloatRegister, __ T16B, $src$$FloatRegister);
__ uaddlp($dst$$FloatRegister, __ T16B, $dst$$FloatRegister);
__ uaddlp($dst$$FloatRegister, __ T8H, $dst$$FloatRegister);
__ uaddlp($dst$$FloatRegister, __ T4S, $dst$$FloatRegister);
// Narrow the two 64-bit counts (T2D) to two 32-bit lanes (T2S).
__ xtn($dst$$FloatRegister, __ T2S, $dst$$FloatRegister, __ T2D);
} else {
// SVE path: count bits per 64-bit (D) element under ptrue, then narrow
// the D elements to S elements using tmp as scratch.
__ sve_cnt($dst$$FloatRegister, __ D, ptrue, $src$$FloatRegister);
__ sve_vector_narrow($dst$$FloatRegister, __ S,
$dst$$FloatRegister, __ D, $tmp$$FloatRegister);
}
%}
ins_pipe(pipe_slow);
%}

// vector popcount - predicated

instruct vpopcountI_masked(vReg dst_src, pRegGov pg) %{
Expand All @@ -5729,7 +5704,7 @@ instruct vpopcountI_masked(vReg dst_src, pRegGov pg) %{
%}

instruct vpopcountL_masked(vReg dst_src, pRegGov pg) %{
predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_LONG);
predicate(UseSVE > 0);
match(Set dst_src (PopCountVL dst_src pg));
format %{ "vpopcountL_masked $dst_src, $pg, $dst_src" %}
ins_encode %{
Expand Down
31 changes: 3 additions & 28 deletions src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ source %{
// Vector API intrinsics.
if ((opcode == Op_VectorCastD2X && bt == T_INT) ||
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
opcode == Op_MulVL) {
Expand Down Expand Up @@ -4055,7 +4057,6 @@ instruct vpopcountI(vReg dst, vReg src) %{
// vector popcount - LONG

instruct vpopcountL(vReg dst, vReg src) %{
predicate(Matcher::vector_element_basic_type(n) == T_LONG);
match(Set dst (PopCountVL src));
format %{ "vpopcountL $dst, $src" %}
ins_encode %{
Expand All @@ -4071,37 +4072,11 @@ instruct vpopcountL(vReg dst, vReg src) %{
ins_pipe(pipe_slow);
%}

// If the PopCountVL is generated by auto-vectorization, the dst basic
// type is T_INT. And once we have unified the type definition for
// Vector API and auto-vectorization, this rule can be merged with
// "vpopcountL" rule.

// Matches PopCountVL when the result vector element type is T_INT (see the
// predicate), i.e. the per-lane 64-bit population counts must end up as
// 32-bit lanes. dst is declared TEMP_DEF and tmp is an additional temporary
// vector register (only the SVE path below references tmp).
instruct vpopcountL_I(vReg dst, vReg src, vReg tmp) %{
predicate(Matcher::vector_element_basic_type(n) == T_INT);
match(Set dst (PopCountVL src));
effect(TEMP_DEF dst, TEMP tmp);
format %{ "vpopcountL_I $dst, $src\t# KILL $tmp" %}
ins_encode %{
if (UseSVE == 0) {
// NEON path: count bits per byte, then run three pairwise widening adds
// (T16B, T8H, T4S arrangements) to accumulate the byte counts into
// 64-bit lanes.
__ cnt($dst$$FloatRegister, __ T16B, $src$$FloatRegister);
__ uaddlp($dst$$FloatRegister, __ T16B, $dst$$FloatRegister);
__ uaddlp($dst$$FloatRegister, __ T8H, $dst$$FloatRegister);
__ uaddlp($dst$$FloatRegister, __ T4S, $dst$$FloatRegister);
// Narrow the two 64-bit counts (T2D) to two 32-bit lanes (T2S).
__ xtn($dst$$FloatRegister, __ T2S, $dst$$FloatRegister, __ T2D);
} else {
// SVE path: count bits per 64-bit (D) element under ptrue, then narrow
// the D elements to S elements using tmp as scratch.
__ sve_cnt($dst$$FloatRegister, __ D, ptrue, $src$$FloatRegister);
__ sve_vector_narrow($dst$$FloatRegister, __ S,
$dst$$FloatRegister, __ D, $tmp$$FloatRegister);
}
%}
ins_pipe(pipe_slow);
%}

// vector popcount - predicated
UNARY_OP_PREDICATE(vpopcountI, PopCountVI, sve_cnt)

instruct vpopcountL_masked(vReg dst_src, pRegGov pg) %{
predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_LONG);
predicate(UseSVE > 0);
match(Set dst_src (PopCountVL dst_src pg));
format %{ "vpopcountL_masked $dst_src, $pg, $dst_src" %}
ins_encode %{
Expand Down
48 changes: 0 additions & 48 deletions src/hotspot/cpu/x86/x86.ad
Original file line number Diff line number Diff line change
Expand Up @@ -8875,12 +8875,6 @@ instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
__ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
// TODO: Once auto-vectorizer supports ConvL2I operation, PopCountVL
// should be succeeded by its corresponding vector IR and following
// special handling should be removed.
if (opcode == Op_PopCountVL && Matcher::vector_element_basic_type(this) == T_INT) {
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
}
%}
ins_pipe( pipe_slow );
%}
Expand Down Expand Up @@ -8911,18 +8905,6 @@ instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %
BasicType bt = Matcher::vector_element_basic_type(this, $src);
__ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
$xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
// TODO: Once auto-vectorizer supports ConvL2I operation, PopCountVL
// should be succeeded by its corresponding vector IR and following
// special handling should be removed.
if (opcode == Op_PopCountVL && Matcher::vector_element_basic_type(this) == T_INT) {
if (VM_Version::supports_avx512vl()) {
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
} else {
assert(VM_Version::supports_avx2(), "");
__ vpshufd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
__ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
}
}
%}
ins_pipe( pipe_slow );
%}
Expand All @@ -8939,15 +8921,8 @@ instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp)
ins_encode %{
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
BasicType rbt = Matcher::vector_element_basic_type(this);
__ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
// TODO: Once auto-vectorizer supports ConvL2I operation, CountTrailingZerosV
// should be succeeded by its corresponding vector IR and following
// special handling should be removed.
if (bt == T_LONG && rbt == T_INT) {
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
}
%}
ins_pipe( pipe_slow );
%}
Expand Down Expand Up @@ -8993,17 +8968,8 @@ instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, v
ins_encode %{
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
BasicType rbt = Matcher::vector_element_basic_type(this);
__ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
// TODO: Once the auto-vectorizer supports the ConvL2I operation,
// CountTrailingZerosV should be followed by its corresponding vector IR
// and the following special handling should be removed.
if (bt == T_LONG && rbt == T_INT) {
assert(VM_Version::supports_avx2(), "");
__ vpshufd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
__ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
}
%}
ins_pipe( pipe_slow );
%}
Expand Down Expand Up @@ -9408,15 +9374,8 @@ instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
ins_encode %{
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
BasicType rbt = Matcher::vector_element_basic_type(this);
__ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
xnoreg, xnoreg, k0, noreg, true, vlen_enc);
// TODO: Once auto-vectorizer supports ConvL2I operation, CountLeadingZerosV
// should be succeeded by its corresponding vector IR and following
// special handling should be removed.
if (rbt == T_INT && bt == T_LONG) {
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
}
%}
ins_pipe( pipe_slow );
%}
Expand Down Expand Up @@ -9491,15 +9450,8 @@ instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, ve
ins_encode %{
int vlen_enc = vector_length_encoding(this, $src);
BasicType bt = Matcher::vector_element_basic_type(this, $src);
BasicType rbt = Matcher::vector_element_basic_type(this);
__ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
// TODO: Once auto-vectorizer supports ConvL2I operation, CountLeadingZerosV
// should be succeeded by its corresponding vector IR and following
// special handling should be removed.
if (rbt == T_INT && bt == T_LONG) {
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
}
%}
ins_pipe( pipe_slow );
%}
Expand Down
59 changes: 38 additions & 21 deletions src/hotspot/share/opto/superword.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2079,6 +2079,14 @@ bool SuperWord::implemented(Node_List* p) {
} else if (is_cmove_fp_opcode(opc)) {
retValue = is_cmov_pack(p) && VectorNode::implemented(opc, size, velt_basic_type(p0));
NOT_PRODUCT(if(retValue && is_trace_cmov()) {tty->print_cr("SWPointer::implemented: found cmove pack"); print_pack(p);})
} else if (requires_long_to_int_conversion(opc)) {
// The Java API methods Long.bitCount/numberOfLeadingZeros/numberOfTrailingZeros
// return int, but the corresponding Vector API operations return long. To
// unify the implementation in the backend, superword splits the vector
// implementation of the Java API into an operation node of long type plus
// another node converting long to int.
retValue = VectorNode::implemented(opc, size, T_LONG) &&
VectorCastNode::implemented(Op_ConvL2I, size, T_LONG, T_INT);
} else {
// Vector unsigned right shift for signed subword types behaves differently
// from Java Spec. But when the shift amount is a constant not greater than
Expand All @@ -2096,6 +2104,18 @@ bool SuperWord::implemented(Node_List* p) {
// Returns true when _cmovev_kit yields a non-NULL pack for the first
// element of p.
bool SuperWord::is_cmov_pack(Node_List* p) {
  const bool has_cmov_pack = (_cmovev_kit.pack(p->at(0)) != NULL);
  return has_cmov_pack;
}

// Returns true if opc is one of Op_PopCountL, Op_CountLeadingZerosL or
// Op_CountTrailingZerosL; false for every other opcode.
bool SuperWord::requires_long_to_int_conversion(int opc) {
  return opc == Op_PopCountL ||
         opc == Op_CountLeadingZerosL ||
         opc == Op_CountTrailingZerosL;
}

//------------------------------same_inputs--------------------------
// For pack p, are all idx operands the same?
bool SuperWord::same_inputs(Node_List* p, int idx) {
Expand Down Expand Up @@ -2666,16 +2686,28 @@ bool SuperWord::output() {
opc == Op_AbsI || opc == Op_AbsL ||
opc == Op_NegF || opc == Op_NegD ||
opc == Op_RoundF || opc == Op_RoundD ||
opc == Op_PopCountI || opc == Op_PopCountL ||
opc == Op_ReverseBytesI || opc == Op_ReverseBytesL ||
opc == Op_ReverseBytesUS || opc == Op_ReverseBytesS ||
opc == Op_ReverseI || opc == Op_ReverseL ||
opc == Op_CountLeadingZerosI || opc == Op_CountLeadingZerosL ||
opc == Op_CountTrailingZerosI || opc == Op_CountTrailingZerosL) {
opc == Op_PopCountI || opc == Op_CountLeadingZerosI ||
opc == Op_CountTrailingZerosI) {
assert(n->req() == 2, "only one input expected");
Node* in = vector_opd(p, 1);
vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (requires_long_to_int_conversion(opc)) {
// The Java API methods Long.bitCount/numberOfLeadingZeros/numberOfTrailingZeros
// return int, but the corresponding Vector API operations return long. To
// unify the implementation in the backend, superword splits the vector
// implementation of the Java API into an operation node of long type plus
// another node converting long to int.
assert(n->req() == 2, "only one input expected");
Node* in = vector_opd(p, 1);
Node* longval = VectorNode::make(opc, in, NULL, vlen, T_LONG);
_igvn.register_new_node_with_optimizer(longval);
_phase->set_ctrl(longval, _phase->get_ctrl(p->at(0)));
vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen);
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} else if (VectorNode::is_convert_opcode(opc)) {
assert(n->req() == 2, "only one input expected");
BasicType bt = velt_basic_type(n);
Expand Down Expand Up @@ -3198,27 +3230,11 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) {
return true;
}

if (VectorNode::is_type_transition_long_to_int(use)) {
// PopCountL/CountLeadingZerosL/CountTrailingZerosL takes long and produces
// int - hence the special checks on alignment and size.
if (u_pk->size() != d_pk->size()) {
return false;
}
for (uint i = 0; i < MIN2(d_pk->size(), u_pk->size()); i++) {
Node* ui = u_pk->at(i);
Node* di = d_pk->at(i);
if (alignment(ui) * 2 != alignment(di)) {
return false;
}
}
return true;
}

if (u_pk->size() != d_pk->size())
return false;

if (longer_type_for_conversion(use) != T_ILLEGAL) {
// type conversion takes a type of a kind of size and produces a type of
// These opcodes take a type of a kind of size and produce a type of
// another size - hence the special checks on alignment and size.
for (uint i = 0; i < u_pk->size(); i++) {
Node* ui = u_pk->at(i);
Expand Down Expand Up @@ -3467,7 +3483,8 @@ void SuperWord::compute_max_depth() {
}

BasicType SuperWord::longer_type_for_conversion(Node* n) {
if (!VectorNode::is_convert_opcode(n->Opcode()) ||
if (!(VectorNode::is_convert_opcode(n->Opcode()) ||
requires_long_to_int_conversion(n->Opcode())) ||
!in_bb(n->in(1))) {
return T_ILLEGAL;
}
Expand Down
1 change: 1 addition & 0 deletions src/hotspot/share/opto/superword.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,7 @@ class SuperWord : public ResourceObj {
bool is_cmov_pack(Node_List* p);
bool is_cmov_pack_internal_node(Node_List* p, Node* nd) { return is_cmov_pack(p) && !nd->is_CMove(); }
static bool is_cmove_fp_opcode(int opc) { return (opc == Op_CMoveF || opc == Op_CMoveD); }
static bool requires_long_to_int_conversion(int opc);
// For pack p, are all idx operands the same?
bool same_inputs(Node_List* p, int idx);
// CloneMap utilities
Expand Down
11 changes: 0 additions & 11 deletions src/hotspot/share/opto/vectornode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -343,17 +343,6 @@ bool VectorNode::is_muladds2i(Node* n) {
return false;
}

// Returns true if the opcode of n is one of Op_PopCountL,
// Op_CountLeadingZerosL or Op_CountTrailingZerosL; false otherwise.
bool VectorNode::is_type_transition_long_to_int(Node* n) {
  const int opc = n->Opcode();
  return opc == Op_PopCountL ||
         opc == Op_CountLeadingZerosL ||
         opc == Op_CountTrailingZerosL;
}

bool VectorNode::is_roundopD(Node* n) {
if (n->Opcode() == Op_RoundDoubleMode) {
return true;
Expand Down
15 changes: 11 additions & 4 deletions src/hotspot/share/opto/vectornode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ class VectorNode : public TypeNode {
static bool is_type_transition_short_to_int(Node* n);
static bool is_type_transition_to_int(Node* n);
static bool is_muladds2i(Node* n);
static bool is_type_transition_long_to_int(Node* n);
static bool is_roundopD(Node* n);
static bool is_scalar_rotate(Node* n);
static bool is_vector_rotate_supported(int opc, uint vlen, BasicType bt);
Expand Down Expand Up @@ -551,7 +550,9 @@ class PopCountVINode : public VectorNode {
// Vector popcount long bits
class PopCountVLNode : public VectorNode {
public:
PopCountVLNode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {}
PopCountVLNode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {
assert(vt->element_basic_type() == T_LONG, "must be long");
}
virtual int Opcode() const;
};

Expand Down Expand Up @@ -1732,15 +1733,21 @@ class RotateLeftVNode : public VectorNode {
class CountLeadingZerosVNode : public VectorNode {
public:
CountLeadingZerosVNode(Node* in, const TypeVect* vt)
: VectorNode(in, vt) {}
: VectorNode(in, vt) {
assert(in->bottom_type()->is_vect()->element_basic_type() == vt->element_basic_type(),
"must be the same");
}

virtual int Opcode() const;
};

class CountTrailingZerosVNode : public VectorNode {
public:
CountTrailingZerosVNode(Node* in, const TypeVect* vt)
: VectorNode(in, vt) {}
: VectorNode(in, vt) {
assert(in->bottom_type()->is_vect()->element_basic_type() == vt->element_basic_type(),
"must be the same");
}

virtual int Opcode() const;
};
Expand Down
Loading

1 comment on commit 4458de9

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.