Skip to content

Commit 4458de9

Browse files
Fei Gao authored and Pengfei Li committed
8297172: Fix some issues of auto-vectorization of Long.bitCount/numberOfTrailingZeros/numberOfLeadingZeros()
Reviewed-by: kvn, thartmann
1 parent a613998 commit 4458de9

File tree

11 files changed

+172
-142
lines changed

11 files changed

+172
-142
lines changed

src/hotspot/cpu/aarch64/aarch64_vector.ad

Lines changed: 3 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@ source %{
132132
// Vector API intrinsics.
133133
if ((opcode == Op_VectorCastD2X && bt == T_INT) ||
134134
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
135+
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
136+
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
135137
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
136138
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
137139
opcode == Op_MulVL) {
@@ -5672,7 +5674,6 @@ instruct vpopcountI(vReg dst, vReg src) %{
56725674
// vector popcount - LONG
56735675

56745676
instruct vpopcountL(vReg dst, vReg src) %{
5675-
predicate(Matcher::vector_element_basic_type(n) == T_LONG);
56765677
match(Set dst (PopCountVL src));
56775678
format %{ "vpopcountL $dst, $src" %}
56785679
ins_encode %{
@@ -5688,32 +5689,6 @@ instruct vpopcountL(vReg dst, vReg src) %{
56885689
ins_pipe(pipe_slow);
56895690
%}
56905691

5691-
// If the PopCountVL is generated by auto-vectorization, the dst basic
5692-
// type is T_INT. And once we have unified the type definition for
5693-
// Vector API and auto-vectorization, this rule can be merged with
5694-
// "vpopcountL" rule.
5695-
5696-
instruct vpopcountL_I(vReg dst, vReg src, vReg tmp) %{
5697-
predicate(Matcher::vector_element_basic_type(n) == T_INT);
5698-
match(Set dst (PopCountVL src));
5699-
effect(TEMP_DEF dst, TEMP tmp);
5700-
format %{ "vpopcountL_I $dst, $src\t# KILL $tmp" %}
5701-
ins_encode %{
5702-
if (UseSVE == 0) {
5703-
__ cnt($dst$$FloatRegister, __ T16B, $src$$FloatRegister);
5704-
__ uaddlp($dst$$FloatRegister, __ T16B, $dst$$FloatRegister);
5705-
__ uaddlp($dst$$FloatRegister, __ T8H, $dst$$FloatRegister);
5706-
__ uaddlp($dst$$FloatRegister, __ T4S, $dst$$FloatRegister);
5707-
__ xtn($dst$$FloatRegister, __ T2S, $dst$$FloatRegister, __ T2D);
5708-
} else {
5709-
__ sve_cnt($dst$$FloatRegister, __ D, ptrue, $src$$FloatRegister);
5710-
__ sve_vector_narrow($dst$$FloatRegister, __ S,
5711-
$dst$$FloatRegister, __ D, $tmp$$FloatRegister);
5712-
}
5713-
%}
5714-
ins_pipe(pipe_slow);
5715-
%}
5716-
57175692
// vector popcount - predicated
57185693

57195694
instruct vpopcountI_masked(vReg dst_src, pRegGov pg) %{
@@ -5729,7 +5704,7 @@ instruct vpopcountI_masked(vReg dst_src, pRegGov pg) %{
57295704
%}
57305705

57315706
instruct vpopcountL_masked(vReg dst_src, pRegGov pg) %{
5732-
predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_LONG);
5707+
predicate(UseSVE > 0);
57335708
match(Set dst_src (PopCountVL dst_src pg));
57345709
format %{ "vpopcountL_masked $dst_src, $pg, $dst_src" %}
57355710
ins_encode %{

src/hotspot/cpu/aarch64/aarch64_vector_ad.m4

Lines changed: 3 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,8 @@ source %{
122122
// Vector API intrinsics.
123123
if ((opcode == Op_VectorCastD2X && bt == T_INT) ||
124124
(opcode == Op_VectorCastL2X && bt == T_FLOAT) ||
125+
(opcode == Op_CountLeadingZerosV && bt == T_LONG) ||
126+
(opcode == Op_CountTrailingZerosV && bt == T_LONG) ||
125127
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
126128
opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
127129
opcode == Op_MulVL) {
@@ -4055,7 +4057,6 @@ instruct vpopcountI(vReg dst, vReg src) %{
40554057
// vector popcount - LONG
40564058

40574059
instruct vpopcountL(vReg dst, vReg src) %{
4058-
predicate(Matcher::vector_element_basic_type(n) == T_LONG);
40594060
match(Set dst (PopCountVL src));
40604061
format %{ "vpopcountL $dst, $src" %}
40614062
ins_encode %{
@@ -4071,37 +4072,11 @@ instruct vpopcountL(vReg dst, vReg src) %{
40714072
ins_pipe(pipe_slow);
40724073
%}
40734074

4074-
// If the PopCountVL is generated by auto-vectorization, the dst basic
4075-
// type is T_INT. And once we have unified the type definition for
4076-
// Vector API and auto-vectorization, this rule can be merged with
4077-
// "vpopcountL" rule.
4078-
4079-
instruct vpopcountL_I(vReg dst, vReg src, vReg tmp) %{
4080-
predicate(Matcher::vector_element_basic_type(n) == T_INT);
4081-
match(Set dst (PopCountVL src));
4082-
effect(TEMP_DEF dst, TEMP tmp);
4083-
format %{ "vpopcountL_I $dst, $src\t# KILL $tmp" %}
4084-
ins_encode %{
4085-
if (UseSVE == 0) {
4086-
__ cnt($dst$$FloatRegister, __ T16B, $src$$FloatRegister);
4087-
__ uaddlp($dst$$FloatRegister, __ T16B, $dst$$FloatRegister);
4088-
__ uaddlp($dst$$FloatRegister, __ T8H, $dst$$FloatRegister);
4089-
__ uaddlp($dst$$FloatRegister, __ T4S, $dst$$FloatRegister);
4090-
__ xtn($dst$$FloatRegister, __ T2S, $dst$$FloatRegister, __ T2D);
4091-
} else {
4092-
__ sve_cnt($dst$$FloatRegister, __ D, ptrue, $src$$FloatRegister);
4093-
__ sve_vector_narrow($dst$$FloatRegister, __ S,
4094-
$dst$$FloatRegister, __ D, $tmp$$FloatRegister);
4095-
}
4096-
%}
4097-
ins_pipe(pipe_slow);
4098-
%}
4099-
41004075
// vector popcount - predicated
41014076
UNARY_OP_PREDICATE(vpopcountI, PopCountVI, sve_cnt)
41024077

41034078
instruct vpopcountL_masked(vReg dst_src, pRegGov pg) %{
4104-
predicate(UseSVE > 0 && Matcher::vector_element_basic_type(n) == T_LONG);
4079+
predicate(UseSVE > 0);
41054080
match(Set dst_src (PopCountVL dst_src pg));
41064081
format %{ "vpopcountL_masked $dst_src, $pg, $dst_src" %}
41074082
ins_encode %{

src/hotspot/cpu/x86/x86.ad

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -8875,12 +8875,6 @@ instruct vpopcount_integral_reg_evex(vec dst, vec src) %{
88758875
int vlen_enc = vector_length_encoding(this, $src);
88768876
BasicType bt = Matcher::vector_element_basic_type(this, $src);
88778877
__ vector_popcount_integral_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, k0, true, vlen_enc);
8878-
// TODO: Once auto-vectorizer supports ConvL2I operation, PopCountVL
8879-
// should be succeeded by its corresponding vector IR and following
8880-
// special handling should be removed.
8881-
if (opcode == Op_PopCountVL && Matcher::vector_element_basic_type(this) == T_INT) {
8882-
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8883-
}
88848878
%}
88858879
ins_pipe( pipe_slow );
88868880
%}
@@ -8911,18 +8905,6 @@ instruct vpopcount_avx_reg(vec dst, vec src, vec xtmp1, vec xtmp2, rRegP rtmp) %
89118905
BasicType bt = Matcher::vector_element_basic_type(this, $src);
89128906
__ vector_popcount_integral(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
89138907
$xtmp2$$XMMRegister, $rtmp$$Register, vlen_enc);
8914-
// TODO: Once auto-vectorizer supports ConvL2I operation, PopCountVL
8915-
// should be succeeded by its corresponding vector IR and following
8916-
// special handling should be removed.
8917-
if (opcode == Op_PopCountVL && Matcher::vector_element_basic_type(this) == T_INT) {
8918-
if (VM_Version::supports_avx512vl()) {
8919-
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8920-
} else {
8921-
assert(VM_Version::supports_avx2(), "");
8922-
__ vpshufd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
8923-
__ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
8924-
}
8925-
}
89268908
%}
89278909
ins_pipe( pipe_slow );
89288910
%}
@@ -8939,15 +8921,8 @@ instruct vcount_trailing_zeros_reg_evex(vec dst, vec src, vec xtmp, rRegP rtmp)
89398921
ins_encode %{
89408922
int vlen_enc = vector_length_encoding(this, $src);
89418923
BasicType bt = Matcher::vector_element_basic_type(this, $src);
8942-
BasicType rbt = Matcher::vector_element_basic_type(this);
89438924
__ vector_count_trailing_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
89448925
xnoreg, xnoreg, $xtmp$$XMMRegister, k0, $rtmp$$Register, vlen_enc);
8945-
// TODO: Once auto-vectorizer supports ConvL2I operation, CountTrailingZerosV
8946-
// should be succeeded by its corresponding vector IR and following
8947-
// special handling should be removed.
8948-
if (bt == T_LONG && rbt == T_INT) {
8949-
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8950-
}
89518926
%}
89528927
ins_pipe( pipe_slow );
89538928
%}
@@ -8993,17 +8968,8 @@ instruct vcount_trailing_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, v
89938968
ins_encode %{
89948969
int vlen_enc = vector_length_encoding(this, $src);
89958970
BasicType bt = Matcher::vector_element_basic_type(this, $src);
8996-
BasicType rbt = Matcher::vector_element_basic_type(this);
89978971
__ vector_count_trailing_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
89988972
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
8999-
// TODO: Once auto-vectorizer supports ConvL2I operation, PopCountVL
9000-
// should be succeeded by its corresponding vector IR and following
9001-
// special handling should be removed.
9002-
if (bt == T_LONG && rbt == T_INT) {
9003-
assert(VM_Version::supports_avx2(), "");
9004-
__ vpshufd($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
9005-
__ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 8, vlen_enc);
9006-
}
90078973
%}
90088974
ins_pipe( pipe_slow );
90098975
%}
@@ -9408,15 +9374,8 @@ instruct vcount_leading_zeros_IL_reg_evex(vec dst, vec src) %{
94089374
ins_encode %{
94099375
int vlen_enc = vector_length_encoding(this, $src);
94109376
BasicType bt = Matcher::vector_element_basic_type(this, $src);
9411-
BasicType rbt = Matcher::vector_element_basic_type(this);
94129377
__ vector_count_leading_zeros_evex(bt, $dst$$XMMRegister, $src$$XMMRegister, xnoreg,
94139378
xnoreg, xnoreg, k0, noreg, true, vlen_enc);
9414-
// TODO: Once auto-vectorizer supports ConvL2I operation, CountLeadingZerosV
9415-
// should be succeeded by its corresponding vector IR and following
9416-
// special handling should be removed.
9417-
if (rbt == T_INT && bt == T_LONG) {
9418-
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
9419-
}
94209379
%}
94219380
ins_pipe( pipe_slow );
94229381
%}
@@ -9491,15 +9450,8 @@ instruct vcount_leading_zeros_reg_avx(vec dst, vec src, vec xtmp1, vec xtmp2, ve
94919450
ins_encode %{
94929451
int vlen_enc = vector_length_encoding(this, $src);
94939452
BasicType bt = Matcher::vector_element_basic_type(this, $src);
9494-
BasicType rbt = Matcher::vector_element_basic_type(this);
94959453
__ vector_count_leading_zeros_avx(bt, $dst$$XMMRegister, $src$$XMMRegister, $xtmp1$$XMMRegister,
94969454
$xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, vlen_enc);
9497-
// TODO: Once auto-vectorizer supports ConvL2I operation, CountLeadingZerosV
9498-
// should be succeeded by its corresponding vector IR and following
9499-
// special handling should be removed.
9500-
if (rbt == T_INT && bt == T_LONG) {
9501-
__ evpmovqd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
9502-
}
95039455
%}
95049456
ins_pipe( pipe_slow );
95059457
%}

src/hotspot/share/opto/superword.cpp

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2079,6 +2079,14 @@ bool SuperWord::implemented(Node_List* p) {
20792079
} else if (is_cmove_fp_opcode(opc)) {
20802080
retValue = is_cmov_pack(p) && VectorNode::implemented(opc, size, velt_basic_type(p0));
20812081
NOT_PRODUCT(if(retValue && is_trace_cmov()) {tty->print_cr("SWPointer::implemented: found cmove pack"); print_pack(p);})
2082+
} else if (requires_long_to_int_conversion(opc)) {
2083+
// Java API for Long.bitCount/numberOfLeadingZeros/numberOfTrailingZeros
2084+
// returns int type, but Vector API for them returns long type. To unify
2085+
// the implementation in backend, superword splits the vector implementation
2086+
// for Java API into an execution node with long type plus another node
2087+
// converting long to int.
2088+
retValue = VectorNode::implemented(opc, size, T_LONG) &&
2089+
VectorCastNode::implemented(Op_ConvL2I, size, T_LONG, T_INT);
20822090
} else {
20832091
// Vector unsigned right shift for signed subword types behaves differently
20842092
// from Java Spec. But when the shift amount is a constant not greater than
@@ -2096,6 +2104,18 @@ bool SuperWord::implemented(Node_List* p) {
20962104
bool SuperWord::is_cmov_pack(Node_List* p) {
20972105
return _cmovev_kit.pack(p->at(0)) != NULL;
20982106
}
2107+
2108+
bool SuperWord::requires_long_to_int_conversion(int opc) {
2109+
switch(opc) {
2110+
case Op_PopCountL:
2111+
case Op_CountLeadingZerosL:
2112+
case Op_CountTrailingZerosL:
2113+
return true;
2114+
default:
2115+
return false;
2116+
}
2117+
}
2118+
20992119
//------------------------------same_inputs--------------------------
21002120
// For pack p, are all idx operands the same?
21012121
bool SuperWord::same_inputs(Node_List* p, int idx) {
@@ -2666,16 +2686,28 @@ bool SuperWord::output() {
26662686
opc == Op_AbsI || opc == Op_AbsL ||
26672687
opc == Op_NegF || opc == Op_NegD ||
26682688
opc == Op_RoundF || opc == Op_RoundD ||
2669-
opc == Op_PopCountI || opc == Op_PopCountL ||
26702689
opc == Op_ReverseBytesI || opc == Op_ReverseBytesL ||
26712690
opc == Op_ReverseBytesUS || opc == Op_ReverseBytesS ||
26722691
opc == Op_ReverseI || opc == Op_ReverseL ||
2673-
opc == Op_CountLeadingZerosI || opc == Op_CountLeadingZerosL ||
2674-
opc == Op_CountTrailingZerosI || opc == Op_CountTrailingZerosL) {
2692+
opc == Op_PopCountI || opc == Op_CountLeadingZerosI ||
2693+
opc == Op_CountTrailingZerosI) {
26752694
assert(n->req() == 2, "only one input expected");
26762695
Node* in = vector_opd(p, 1);
26772696
vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
26782697
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
2698+
} else if (requires_long_to_int_conversion(opc)) {
2699+
// Java API for Long.bitCount/numberOfLeadingZeros/numberOfTrailingZeros
2700+
// returns int type, but Vector API for them returns long type. To unify
2701+
// the implementation in backend, superword splits the vector implementation
2702+
// for Java API into an execution node with long type plus another node
2703+
// converting long to int.
2704+
assert(n->req() == 2, "only one input expected");
2705+
Node* in = vector_opd(p, 1);
2706+
Node* longval = VectorNode::make(opc, in, NULL, vlen, T_LONG);
2707+
_igvn.register_new_node_with_optimizer(longval);
2708+
_phase->set_ctrl(longval, _phase->get_ctrl(p->at(0)));
2709+
vn = VectorCastNode::make(Op_VectorCastL2X, longval, T_INT, vlen);
2710+
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
26792711
} else if (VectorNode::is_convert_opcode(opc)) {
26802712
assert(n->req() == 2, "only one input expected");
26812713
BasicType bt = velt_basic_type(n);
@@ -3198,27 +3230,11 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) {
31983230
return true;
31993231
}
32003232

3201-
if (VectorNode::is_type_transition_long_to_int(use)) {
3202-
// PopCountL/CountLeadingZerosL/CountTrailingZerosL takes long and produces
3203-
// int - hence the special checks on alignment and size.
3204-
if (u_pk->size() != d_pk->size()) {
3205-
return false;
3206-
}
3207-
for (uint i = 0; i < MIN2(d_pk->size(), u_pk->size()); i++) {
3208-
Node* ui = u_pk->at(i);
3209-
Node* di = d_pk->at(i);
3210-
if (alignment(ui) * 2 != alignment(di)) {
3211-
return false;
3212-
}
3213-
}
3214-
return true;
3215-
}
3216-
32173233
if (u_pk->size() != d_pk->size())
32183234
return false;
32193235

32203236
if (longer_type_for_conversion(use) != T_ILLEGAL) {
3221-
// type conversion takes a type of a kind of size and produces a type of
3237+
// These opcodes take a type of one size and produce a type of
32223238
// another size - hence the special checks on alignment and size.
32233239
for (uint i = 0; i < u_pk->size(); i++) {
32243240
Node* ui = u_pk->at(i);
@@ -3467,7 +3483,8 @@ void SuperWord::compute_max_depth() {
34673483
}
34683484

34693485
BasicType SuperWord::longer_type_for_conversion(Node* n) {
3470-
if (!VectorNode::is_convert_opcode(n->Opcode()) ||
3486+
if (!(VectorNode::is_convert_opcode(n->Opcode()) ||
3487+
requires_long_to_int_conversion(n->Opcode())) ||
34713488
!in_bb(n->in(1))) {
34723489
return T_ILLEGAL;
34733490
}

src/hotspot/share/opto/superword.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,7 @@ class SuperWord : public ResourceObj {
457457
bool is_cmov_pack(Node_List* p);
458458
bool is_cmov_pack_internal_node(Node_List* p, Node* nd) { return is_cmov_pack(p) && !nd->is_CMove(); }
459459
static bool is_cmove_fp_opcode(int opc) { return (opc == Op_CMoveF || opc == Op_CMoveD); }
460+
static bool requires_long_to_int_conversion(int opc);
460461
// For pack p, are all idx operands the same?
461462
bool same_inputs(Node_List* p, int idx);
462463
// CloneMap utilities

src/hotspot/share/opto/vectornode.cpp

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -343,17 +343,6 @@ bool VectorNode::is_muladds2i(Node* n) {
343343
return false;
344344
}
345345

346-
bool VectorNode::is_type_transition_long_to_int(Node* n) {
347-
switch(n->Opcode()) {
348-
case Op_PopCountL:
349-
case Op_CountLeadingZerosL:
350-
case Op_CountTrailingZerosL:
351-
return true;
352-
default:
353-
return false;
354-
}
355-
}
356-
357346
bool VectorNode::is_roundopD(Node* n) {
358347
if (n->Opcode() == Op_RoundDoubleMode) {
359348
return true;

src/hotspot/share/opto/vectornode.hpp

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,6 @@ class VectorNode : public TypeNode {
9999
static bool is_type_transition_short_to_int(Node* n);
100100
static bool is_type_transition_to_int(Node* n);
101101
static bool is_muladds2i(Node* n);
102-
static bool is_type_transition_long_to_int(Node* n);
103102
static bool is_roundopD(Node* n);
104103
static bool is_scalar_rotate(Node* n);
105104
static bool is_vector_rotate_supported(int opc, uint vlen, BasicType bt);
@@ -551,7 +550,9 @@ class PopCountVINode : public VectorNode {
551550
// Vector popcount long bits
552551
class PopCountVLNode : public VectorNode {
553552
public:
554-
PopCountVLNode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {}
553+
PopCountVLNode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {
554+
assert(vt->element_basic_type() == T_LONG, "must be long");
555+
}
555556
virtual int Opcode() const;
556557
};
557558

@@ -1732,15 +1733,21 @@ class RotateLeftVNode : public VectorNode {
17321733
class CountLeadingZerosVNode : public VectorNode {
17331734
public:
17341735
CountLeadingZerosVNode(Node* in, const TypeVect* vt)
1735-
: VectorNode(in, vt) {}
1736+
: VectorNode(in, vt) {
1737+
assert(in->bottom_type()->is_vect()->element_basic_type() == vt->element_basic_type(),
1738+
"must be the same");
1739+
}
17361740

17371741
virtual int Opcode() const;
17381742
};
17391743

17401744
class CountTrailingZerosVNode : public VectorNode {
17411745
public:
17421746
CountTrailingZerosVNode(Node* in, const TypeVect* vt)
1743-
: VectorNode(in, vt) {}
1747+
: VectorNode(in, vt) {
1748+
assert(in->bottom_type()->is_vect()->element_basic_type() == vt->element_basic_type(),
1749+
"must be the same");
1750+
}
17441751

17451752
virtual int Opcode() const;
17461753
};

0 commit comments

Comments
 (0)