Skip to content

Commit

Permalink
8304450: [vectorapi] Refactor VectorShuffle implementation
Browse files Browse the repository at this point in the history
Reviewed-by: psandoz, xgong, jbhateja, vlivanov
  • Loading branch information
Quan Anh Mai committed Apr 13, 2023
1 parent 3f36dd8 commit e846a1d
Show file tree
Hide file tree
Showing 64 changed files with 2,521 additions and 2,075 deletions.
40 changes: 5 additions & 35 deletions src/hotspot/cpu/aarch64/aarch64_vector.ad
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,10 @@ source %{
}
}

// Reports whether the shuffle argument of a vector rearrange must be prepared
// first. This port always answers false, whatever the element type (elem_bt)
// or the vector length (vlen).
const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
return false;
}

// Assert that the given node is not a variable shift.
bool assert_not_var_shift(const Node* n) {
assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift");
Expand Down Expand Up @@ -6065,41 +6069,6 @@ instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{
ins_pipe(pipe_slow);
%}

// ------------------------------ Vector shuffle -------------------------------

// Matches VectorLoadShuffle: takes the shuffle indices held in $src and
// produces them in $dst in the layout of the node's element type. Byte
// shuffles are copied unchanged; wider element types get every byte index
// extended to the element size.
instruct loadshuffle(vReg dst, vReg src) %{
match(Set dst (VectorLoadShuffle src));
format %{ "loadshuffle $dst, $src" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
if (bt == T_BYTE) {
// Byte indices need no widening: only a register-to-register copy is
// emitted, and nothing at all when dst and src are the same register.
if ($dst$$FloatRegister != $src$$FloatRegister) {
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// orr of src with itself serves as the vector move; the arrangement
// follows the vector size (16 bytes -> T16B, otherwise T8B).
__ orr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
$src$$FloatRegister, $src$$FloatRegister);
} else {
assert(UseSVE > 0, "must be sve");
__ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister);
}
}
} else {
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// 4S/8S, 4I, 4F
// First widening step: bytes (T8B) to halfwords (T8H).
__ uxtl($dst$$FloatRegister, __ T8H, $src$$FloatRegister, __ T8B);
if (type2aelembytes(bt) == 4) {
// 4-byte elements need a second step: halfwords (T4H) to words (T4S).
__ uxtl($dst$$FloatRegister, __ T4S, $dst$$FloatRegister, __ T4H);
}
} else {
assert(UseSVE > 0, "must be sve");
// Extend from byte lanes (B) directly to the lane size of bt.
__ sve_vector_extend($dst$$FloatRegister, __ elemType_to_regVariant(bt),
$src$$FloatRegister, __ B);
}
}
%}
ins_pipe(pipe_slow);
%}

// ------------------------------ Vector rearrange -----------------------------

// Here is an example that rearranges a NEON vector with 4 ints:
Expand All @@ -6122,6 +6091,7 @@ instruct loadshuffle(vReg dst, vReg src) %{
// need to lookup 2/4 bytes as a group. For VectorRearrange long, we use bsl
// to implement rearrange.

// Maybe move the shuffle preparation to VectorLoadShuffle
instruct rearrange_HS_neon(vReg dst, vReg src, vReg shuffle, vReg tmp1, vReg tmp2) %{
predicate(UseSVE == 0 &&
(Matcher::vector_element_basic_type(n) == T_SHORT ||
Expand Down
40 changes: 5 additions & 35 deletions src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,10 @@ source %{
}
}

// Reports whether the shuffle argument of a vector rearrange must be prepared
// first. This port always answers false for every element type and vector
// length.
const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
return false;
}

// Assert that the given node is not a variable shift.
bool assert_not_var_shift(const Node* n) {
assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift");
Expand Down Expand Up @@ -4418,41 +4422,6 @@ instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{
ins_pipe(pipe_slow);
%}

// ------------------------------ Vector shuffle -------------------------------

// Matches VectorLoadShuffle: takes the shuffle indices held in $src and
// produces them in $dst in the layout of the element type of the node. Byte
// shuffles are copied unchanged; wider element types get every byte index
// extended to the element size.
instruct loadshuffle(vReg dst, vReg src) %{
match(Set dst (VectorLoadShuffle src));
format %{ "loadshuffle $dst, $src" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
if (bt == T_BYTE) {
// Byte indices need no widening: only a register-to-register copy is
// emitted and nothing at all when dst and src are the same register.
if ($dst$$FloatRegister != $src$$FloatRegister) {
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// orr of src with itself serves as the vector move; the arrangement
// follows the vector size: T16B for 16 bytes and T8B otherwise.
__ orr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
$src$$FloatRegister, $src$$FloatRegister);
} else {
assert(UseSVE > 0, "must be sve");
__ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister);
}
}
} else {
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// 4S/8S, 4I, 4F
// First widening step: bytes to halfwords.
__ uxtl($dst$$FloatRegister, __ T8H, $src$$FloatRegister, __ T8B);
if (type2aelembytes(bt) == 4) {
// 4-byte elements need a second step: halfwords to words.
__ uxtl($dst$$FloatRegister, __ T4S, $dst$$FloatRegister, __ T4H);
}
} else {
assert(UseSVE > 0, "must be sve");
// Extend from byte lanes directly to the lane size of bt.
__ sve_vector_extend($dst$$FloatRegister, __ elemType_to_regVariant(bt),
$src$$FloatRegister, __ B);
}
}
%}
ins_pipe(pipe_slow);
%}

// ------------------------------ Vector rearrange -----------------------------

// Here is an example that rearranges a NEON vector with 4 ints:
Expand All @@ -4475,6 +4444,7 @@ instruct loadshuffle(vReg dst, vReg src) %{
// need to lookup 2/4 bytes as a group. For VectorRearrange long, we use bsl
// to implement rearrange.

// Maybe move the shuffle preparation to VectorLoadShuffle
instruct rearrange_HS_neon(vReg dst, vReg src, vReg shuffle, vReg tmp1, vReg tmp2) %{
predicate(UseSVE == 0 &&
(Matcher::vector_element_basic_type(n) == T_SHORT ||
Expand Down
4 changes: 4 additions & 0 deletions src/hotspot/cpu/arm/arm.ad
Original file line number Diff line number Diff line change
Expand Up @@ -1025,6 +1025,10 @@ const bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect*
return false;
}

// Reports whether the shuffle argument of a vector rearrange must be prepared
// first. This port always answers false, whatever the element type (elem_bt)
// or the vector length (vlen).
const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
return false;
}

const RegMask* Matcher::predicate_reg_mask(void) {
return NULL;
}
Expand Down
4 changes: 4 additions & 0 deletions src/hotspot/cpu/ppc/ppc.ad
Original file line number Diff line number Diff line change
Expand Up @@ -2189,6 +2189,10 @@ const bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect*
return false;
}

// Reports whether the shuffle argument of a vector rearrange must be prepared
// first. This port always answers false, whatever the element type (elem_bt)
// or the vector length (vlen).
const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
return false;
}

const RegMask* Matcher::predicate_reg_mask(void) {
return NULL;
}
Expand Down
4 changes: 4 additions & 0 deletions src/hotspot/cpu/riscv/riscv.ad
Original file line number Diff line number Diff line change
Expand Up @@ -1870,6 +1870,10 @@ const bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect*
return false;
}

// Reports whether the shuffle argument of a vector rearrange must be prepared
// first. This port always answers false, whatever the element type (elem_bt)
// or the vector length (vlen).
const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
return false;
}

const RegMask* Matcher::predicate_reg_mask(void) {
return NULL;
}
Expand Down
4 changes: 4 additions & 0 deletions src/hotspot/cpu/s390/s390.ad
Original file line number Diff line number Diff line change
Expand Up @@ -1529,6 +1529,10 @@ const bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect*
return false;
}

// Reports whether the shuffle argument of a vector rearrange must be prepared
// first. This port always answers false, whatever the element type (elem_bt)
// or the vector length (vlen).
const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
return false;
}

const RegMask* Matcher::predicate_reg_mask(void) {
return NULL;
}
Expand Down
77 changes: 18 additions & 59 deletions src/hotspot/cpu/x86/x86.ad
Original file line number Diff line number Diff line change
Expand Up @@ -2171,6 +2171,19 @@ const bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect*
return false;
}

// Tells the caller whether Vector::rearrange requires its shuffle argument to
// be prepared beforehand. The answer depends on the element type and, for
// T_LONG, also on the vector length; any element type other than
// T_BYTE/T_SHORT/T_INT/T_LONG is treated as a programming error.
const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
  if (elem_bt == T_BYTE) {
    // Byte shuffles never need preparation.
    return false;
  }
  if (elem_bt == T_SHORT) {
    // Needed only when AVX512BW is not available.
    return !VM_Version::supports_avx512bw();
  }
  if (elem_bt == T_INT) {
    // Needed only when AVX is not available.
    return !VM_Version::supports_avx();
  }
  if (elem_bt == T_LONG) {
    // Needed only for vectors shorter than 8 lanes without AVX512VL.
    return vlen < 8 && !VM_Version::supports_avx512vl();
  }
  ShouldNotReachHere();
  return false;
}

MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
assert(Matcher::is_generic_vector(generic_opnd), "not generic");
bool legacy = (generic_opnd->opcode() == LEGVEC);
Expand Down Expand Up @@ -8406,17 +8419,6 @@ instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
//-------------------------------- Rearrange ----------------------------------

// LoadShuffle/Rearrange for Byte

// VectorLoadShuffle for byte elements: the node is matched with $dst as both
// input and output, and no machine code is emitted for it.
instruct loadShuffleB(vec dst) %{
predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
match(Set dst (VectorLoadShuffle dst));
format %{ "vector_load_shuffle $dst, $dst" %}
ins_encode %{
// empty: intentionally emits nothing
%}
ins_pipe( pipe_slow );
%}

instruct rearrangeB(vec dst, vec shuffle) %{
predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
Matcher::vector_length(n) < 32);
Expand Down Expand Up @@ -8483,7 +8485,7 @@ instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{

instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
!VM_Version::supports_avx512bw());
match(Set dst (VectorLoadShuffle src));
effect(TEMP dst, TEMP vtmp);
format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
Expand All @@ -8494,7 +8496,7 @@ instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
if (UseAVX == 0) {
assert(vlen_in_bytes <= 16, "required");
// Multiply each shuffle by two to get byte index
__ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
__ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
__ psllw($vtmp$$XMMRegister, 1);

// Duplicate to create 2 copies of byte index
Expand All @@ -8509,8 +8511,7 @@ instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
int vlen_enc = vector_length_encoding(this);
// Multiply each shuffle by two to get byte index
__ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
__ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
__ vpsllw($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);

// Duplicate to create 2 copies of byte index
__ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister, 8, vlen_enc);
Expand Down Expand Up @@ -8557,21 +8558,6 @@ instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVe
ins_pipe( pipe_slow );
%}

// VectorLoadShuffle for short elements when AVX512BW is supported: widens the
// byte indices in $src to words in $dst with a single vpmovzxbw.
instruct loadShuffleS_evex(vec dst, vec src) %{
predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
VM_Version::supports_avx512bw());
match(Set dst (VectorLoadShuffle src));
format %{ "vector_load_shuffle $dst, $src" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
// Without AVX512VL the 512-bit encoding is used regardless of the
// vector length of this node.
if (!VM_Version::supports_avx512vl()) {
vlen_enc = Assembler::AVX_512bit;
}
__ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
VM_Version::supports_avx512bw());
Expand Down Expand Up @@ -8602,7 +8588,7 @@ instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
// only byte shuffle instruction available on these platforms

// Duplicate and multiply each shuffle by 4
__ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
__ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
__ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
__ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
__ psllw($vtmp$$XMMRegister, 2);
Expand Down Expand Up @@ -8631,18 +8617,6 @@ instruct rearrangeI(vec dst, vec shuffle) %{
ins_pipe( pipe_slow );
%}

// VectorLoadShuffle for int/float elements when AVX is enabled (UseAVX > 0):
// widens the byte indices in $src to double words in $dst with vpmovzxbd.
instruct loadShuffleI_avx(vec dst, vec src) %{
predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
UseAVX > 0);
match(Set dst (VectorLoadShuffle src));
format %{ "vector_load_shuffle $dst, $src" %}
ins_encode %{
// The encoding width is taken from the vector length of this node.
int vlen_enc = vector_length_encoding(this);
__ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
UseAVX > 0);
Expand Down Expand Up @@ -8672,8 +8646,7 @@ instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
// only double word shuffle instruction available on these platforms

// Multiply each shuffle by two to get double word index
__ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
__ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
__ vpsllq($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);

// Duplicate each double word shuffle
__ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
Expand All @@ -8699,20 +8672,6 @@ instruct rearrangeL(vec dst, vec src, vec shuffle) %{
ins_pipe( pipe_slow );
%}

// VectorLoadShuffle for long/double elements, selected for 8-element vectors
// or whenever AVX512VL is supported: widens the byte indices in $src to quad
// words in $dst with vpmovzxbq.
instruct loadShuffleL_evex(vec dst, vec src) %{
predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
(Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
match(Set dst (VectorLoadShuffle src));
format %{ "vector_load_shuffle $dst, $src" %}
ins_encode %{
// This rule is only expected to be used with UseAVX > 2.
assert(UseAVX > 2, "required");

// The encoding width is taken from the vector length of this node.
int vlen_enc = vector_length_encoding(this);
__ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
(Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
Expand Down
18 changes: 0 additions & 18 deletions src/hotspot/share/classfile/vmIntrinsics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -963,24 +963,6 @@ class methodHandle;
"Ljdk/internal/vm/vector/VectorSupport$VectorPayload;") \
do_name(vector_frombits_coerced_name, "fromBitsCoerced") \
\
do_intrinsic(_VectorShuffleIota, jdk_internal_vm_vector_VectorSupport, vector_shuffle_step_iota_name, vector_shuffle_step_iota_sig, F_S) \
do_signature(vector_shuffle_step_iota_sig, "(Ljava/lang/Class;" \
"Ljava/lang/Class;" \
"Ljdk/internal/vm/vector/VectorSupport$VectorSpecies;" \
"IIII" \
"Ljdk/internal/vm/vector/VectorSupport$ShuffleIotaOperation;)" \
"Ljdk/internal/vm/vector/VectorSupport$VectorShuffle;") \
do_name(vector_shuffle_step_iota_name, "shuffleIota") \
\
do_intrinsic(_VectorShuffleToVector, jdk_internal_vm_vector_VectorSupport, vector_shuffle_to_vector_name, vector_shuffle_to_vector_sig, F_S) \
do_signature(vector_shuffle_to_vector_sig, "(Ljava/lang/Class;" \
"Ljava/lang/Class;" \
"Ljava/lang/Class;" \
"Ljdk/internal/vm/vector/VectorSupport$VectorShuffle;" \
"ILjdk/internal/vm/vector/VectorSupport$ShuffleToVectorOperation;)" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;") \
do_name(vector_shuffle_to_vector_name, "shuffleToVector") \
\
do_intrinsic(_VectorLoadOp, jdk_internal_vm_vector_VectorSupport, vector_load_op_name, vector_load_op_sig, F_S) \
do_signature(vector_load_op_sig, "(Ljava/lang/Class;" \
"Ljava/lang/Class;" \
Expand Down
2 changes: 0 additions & 2 deletions src/hotspot/share/opto/c2compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -754,8 +754,6 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method) {
case vmIntrinsics::_VectorBinaryOp:
case vmIntrinsics::_VectorTernaryOp:
case vmIntrinsics::_VectorFromBitsCoerced:
case vmIntrinsics::_VectorShuffleIota:
case vmIntrinsics::_VectorShuffleToVector:
case vmIntrinsics::_VectorLoadOp:
case vmIntrinsics::_VectorLoadMaskedOp:
case vmIntrinsics::_VectorStoreOp:
Expand Down
2 changes: 1 addition & 1 deletion src/hotspot/share/opto/graphKit.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -904,7 +904,7 @@ class GraphKit : public Phase {

// Vector API support (implemented in vectorIntrinsics.cpp)
Node* box_vector(Node* in, const TypeInstPtr* vbox_type, BasicType elem_bt, int num_elem, bool deoptimize_on_exception = false);
Node* unbox_vector(Node* in, const TypeInstPtr* vbox_type, BasicType elem_bt, int num_elem, bool shuffle_to_vector = false);
Node* unbox_vector(Node* in, const TypeInstPtr* vbox_type, BasicType elem_bt, int num_elem);
Node* vector_shift_count(Node* cnt, int shift_op, BasicType bt, int num_elem);
};

Expand Down
4 changes: 0 additions & 4 deletions src/hotspot/share/opto/library_call.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -695,12 +695,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {
return inline_vector_nary_operation(3);
case vmIntrinsics::_VectorFromBitsCoerced:
return inline_vector_frombits_coerced();
case vmIntrinsics::_VectorShuffleIota:
return inline_vector_shuffle_iota();
case vmIntrinsics::_VectorMaskOp:
return inline_vector_mask_operation();
case vmIntrinsics::_VectorShuffleToVector:
return inline_vector_shuffle_to_vector();
case vmIntrinsics::_VectorLoadOp:
return inline_vector_mem_operation(/*is_store=*/false);
case vmIntrinsics::_VectorLoadMaskedOp:
Expand Down
2 changes: 0 additions & 2 deletions src/hotspot/share/opto/library_call.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,8 +345,6 @@ class LibraryCallKit : public GraphKit {
// Vector API support
bool inline_vector_nary_operation(int n);
bool inline_vector_frombits_coerced();
bool inline_vector_shuffle_to_vector();
bool inline_vector_shuffle_iota();
bool inline_vector_mask_operation();
bool inline_vector_mem_operation(bool is_store);
bool inline_vector_mem_masked_operation(bool is_store);
Expand Down
2 changes: 2 additions & 0 deletions src/hotspot/share/opto/matcher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,8 @@ class Matcher : public PhaseTransform {

static const bool vector_needs_partial_operations(Node* node, const TypeVect* vt);

static const bool vector_needs_load_shuffle(BasicType elem_bt, int vlen);

static const RegMask* predicate_reg_mask(void);
static const TypeVectMask* predicate_reg_type(const Type* elemTy, int length);

Expand Down
Loading

1 comment on commit e846a1d

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.