Skip to content
This repository has been archived by the owner on Sep 19, 2023. It is now read-only.
/ jdk21 Public archive

Commit

Permalink
8310459: [BACKOUT] 8304450: [vectorapi] Refactor VectorShuffle implementation
Browse files (browse the repository at this point in the history)

Reviewed-by: kvn
Backport-of: ff9a7541097bd853306a8594c97774f36877a0f9
  • Loading branch information
TobiHartmann committed Jun 28, 2023
1 parent 3248dae commit 9998c07
Show file tree
Hide file tree
Showing 64 changed files with 2,076 additions and 2,726 deletions.
40 changes: 35 additions & 5 deletions src/hotspot/cpu/aarch64/aarch64_vector.ad
Original file line number Diff line number Diff line change
Expand Up @@ -315,10 +315,6 @@ source %{
}
}

// Platform hook: unconditionally answers false, ignoring both elem_bt and vlen.
// NOTE(review): presumably reports whether a rearrange needs a separate
// shuffle-preparation step for the given element type/length - confirm against
// the declaration in matcher.hpp.
const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
return false;
}

// Assert that the given node is not a variable shift.
bool assert_not_var_shift(const Node* n) {
assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift");
Expand Down Expand Up @@ -6162,6 +6158,41 @@ instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{
ins_pipe(pipe_slow);
%}

// ------------------------------ Vector shuffle -------------------------------

// Matches VectorLoadShuffle and produces, in $dst, the byte-sized shuffle
// elements of $src laid out in lanes of this node's element type:
//  - T_BYTE: a plain register copy (nothing is emitted when dst == src);
//  - wider element types: every source byte is widened to the element size.
// NEON is used when VM_Version::use_neon_for_vector() accepts the vector
// length, otherwise the SVE forms are emitted.
instruct loadshuffle(vReg dst, vReg src) %{
match(Set dst (VectorLoadShuffle src));
format %{ "loadshuffle $dst, $src" %}
ins_encode %{
// Element type and total size are taken from the matched node itself.
BasicType bt = Matcher::vector_element_basic_type(this);
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
if (bt == T_BYTE) {
// Byte shuffles need no widening; only move the value if the register
// allocator assigned different registers.
if ($dst$$FloatRegister != $src$$FloatRegister) {
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// ORR of a register with itself acts as a vector move; the arrangement
// is 16B for 16-byte vectors and 8B otherwise.
__ orr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
$src$$FloatRegister, $src$$FloatRegister);
} else {
assert(UseSVE > 0, "must be sve");
// Same self-ORR move, SVE encoding.
__ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister);
}
}
} else {
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// 4S/8S, 4I, 4F
// First widening step: bytes (8B) -> halfwords (8H).
__ uxtl($dst$$FloatRegister, __ T8H, $src$$FloatRegister, __ T8B);
// 4-byte element types need a second step: halfwords (4H) -> words (4S).
if (type2aelembytes(bt) == 4) {
__ uxtl($dst$$FloatRegister, __ T4S, $dst$$FloatRegister, __ T4H);
}
} else {
assert(UseSVE > 0, "must be sve");
// SVE: extend from byte lanes (B) directly to the target element width.
__ sve_vector_extend($dst$$FloatRegister, __ elemType_to_regVariant(bt),
$src$$FloatRegister, __ B);
}
}
%}
ins_pipe(pipe_slow);
%}

// ------------------------------ Vector rearrange -----------------------------

// Here is an example that rearranges a NEON vector with 4 ints:
Expand All @@ -6184,7 +6215,6 @@ instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{
// need to lookup 2/4 bytes as a group. For VectorRearrange long, we use bsl
// to implement rearrange.

// Maybe move the shuffle preparation to VectorLoadShuffle
instruct rearrange_HS_neon(vReg dst, vReg src, vReg shuffle, vReg tmp1, vReg tmp2) %{
predicate(UseSVE == 0 &&
(Matcher::vector_element_basic_type(n) == T_SHORT ||
Expand Down
40 changes: 35 additions & 5 deletions src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
Original file line number Diff line number Diff line change
Expand Up @@ -305,10 +305,6 @@ source %{
}
}

// Platform hook: unconditionally answers false and ignores both arguments.
// NOTE(review): presumably reports whether a rearrange needs a separate
// shuffle-preparation step - confirm against the declaration in matcher.hpp.
const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
return false;
}

// Assert that the given node is not a variable shift.
bool assert_not_var_shift(const Node* n) {
assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift");
Expand Down Expand Up @@ -4428,6 +4424,41 @@ instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{
ins_pipe(pipe_slow);
%}

// ------------------------------ Vector shuffle -------------------------------

// Matches VectorLoadShuffle and produces in dst the byte-sized shuffle
// elements of src, laid out in lanes of the element type of this node:
//  - T_BYTE: a plain register copy (nothing is emitted when dst equals src);
//  - wider element types: every source byte is widened to the element size.
// NEON is used when VM_Version::use_neon_for_vector accepts the vector size,
// otherwise the SVE forms are emitted.
instruct loadshuffle(vReg dst, vReg src) %{
match(Set dst (VectorLoadShuffle src));
format %{ "loadshuffle $dst, $src" %}
ins_encode %{
// Element type and total size are taken from the matched node itself.
BasicType bt = Matcher::vector_element_basic_type(this);
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
if (bt == T_BYTE) {
// Byte shuffles need no widening; only move the value when the two
// operands ended up in different registers.
if ($dst$$FloatRegister != $src$$FloatRegister) {
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// ORR of a register with itself acts as a vector move; the arrangement
// is 16B for 16-byte vectors and 8B otherwise.
__ orr($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
$src$$FloatRegister, $src$$FloatRegister);
} else {
assert(UseSVE > 0, "must be sve");
// Same self-ORR move, SVE encoding.
__ sve_orr($dst$$FloatRegister, $src$$FloatRegister, $src$$FloatRegister);
}
}
} else {
if (VM_Version::use_neon_for_vector(length_in_bytes)) {
// 4S/8S, 4I, 4F
// First widening step: bytes (8B) to halfwords (8H).
__ uxtl($dst$$FloatRegister, __ T8H, $src$$FloatRegister, __ T8B);
// 4-byte element types need a second step: halfwords (4H) to words (4S).
if (type2aelembytes(bt) == 4) {
__ uxtl($dst$$FloatRegister, __ T4S, $dst$$FloatRegister, __ T4H);
}
} else {
assert(UseSVE > 0, "must be sve");
// SVE: extend from byte lanes (B) directly to the target element width.
__ sve_vector_extend($dst$$FloatRegister, __ elemType_to_regVariant(bt),
$src$$FloatRegister, __ B);
}
}
%}
ins_pipe(pipe_slow);
%}

// ------------------------------ Vector rearrange -----------------------------

// Here is an example that rearranges a NEON vector with 4 ints:
Expand All @@ -4450,7 +4481,6 @@ instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{
// need to lookup 2/4 bytes as a group. For VectorRearrange long, we use bsl
// to implement rearrange.

// Maybe move the shuffle preparation to VectorLoadShuffle
instruct rearrange_HS_neon(vReg dst, vReg src, vReg shuffle, vReg tmp1, vReg tmp2) %{
predicate(UseSVE == 0 &&
(Matcher::vector_element_basic_type(n) == T_SHORT ||
Expand Down
4 changes: 0 additions & 4 deletions src/hotspot/cpu/arm/arm.ad
Original file line number Diff line number Diff line change
Expand Up @@ -1025,10 +1025,6 @@ const bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect*
return false;
}

// Platform hook: unconditionally answers false, ignoring both elem_bt and vlen.
// NOTE(review): presumably reports whether a rearrange needs a separate
// shuffle-preparation step - confirm against the declaration in matcher.hpp.
const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
return false;
}

// Always returns NULL: no predicate register mask is provided here.
// NOTE(review): presumably because this platform has no vector predicate
// registers - confirm.
const RegMask* Matcher::predicate_reg_mask(void) {
return NULL;
}
Expand Down
4 changes: 0 additions & 4 deletions src/hotspot/cpu/ppc/ppc.ad
Original file line number Diff line number Diff line change
Expand Up @@ -2189,10 +2189,6 @@ const bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect*
return false;
}

// Platform hook: unconditionally answers false, ignoring both elem_bt and vlen.
// NOTE(review): presumably reports whether a rearrange needs a separate
// shuffle-preparation step - confirm against the declaration in matcher.hpp.
const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
return false;
}

// Always returns NULL: no predicate register mask is provided here.
// NOTE(review): presumably because this platform has no vector predicate
// registers - confirm.
const RegMask* Matcher::predicate_reg_mask(void) {
return NULL;
}
Expand Down
5 changes: 1 addition & 4 deletions src/hotspot/cpu/riscv/riscv_v.ad
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,6 @@ source %{
return false;
}

// Platform hook: unconditionally answers false, ignoring both elem_bt and vlen.
// NOTE(review): presumably reports whether a rearrange needs a separate
// shuffle-preparation step - confirm against the declaration in matcher.hpp.
const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
return false;
}
%}

definitions %{
Expand Down Expand Up @@ -4035,4 +4032,4 @@ instruct vtest_alltrue_branch(cmpOpEqNe cop, vRegMask op1, vRegMask op2, label l
__ enc_cmpEqNe_imm0_branch($cop$$cmpcode, t0, *($lbl$$label), /* is_far */ true);
%}
ins_pipe(pipe_slow);
%}
%}
4 changes: 0 additions & 4 deletions src/hotspot/cpu/s390/s390.ad
Original file line number Diff line number Diff line change
Expand Up @@ -1529,10 +1529,6 @@ const bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect*
return false;
}

// Platform hook: unconditionally answers false, ignoring both elem_bt and vlen.
// NOTE(review): presumably reports whether a rearrange needs a separate
// shuffle-preparation step - confirm against the declaration in matcher.hpp.
const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
return false;
}

// Always returns NULL: no predicate register mask is provided here.
// NOTE(review): presumably because this platform has no vector predicate
// registers - confirm.
const RegMask* Matcher::predicate_reg_mask(void) {
return NULL;
}
Expand Down
77 changes: 59 additions & 18 deletions src/hotspot/cpu/x86/x86.ad
Original file line number Diff line number Diff line change
Expand Up @@ -2151,19 +2151,6 @@ const bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect*
return false;
}

// Return true if Vector::rearrange needs preparation of the shuffle argument
//
// elem_bt  element type of the vector being rearranged
// vlen     number of vector elements (only consulted for T_LONG)
//
// The answer depends on the element type and the CPU features available:
//   T_BYTE  - never
//   T_SHORT - only when AVX512BW is not supported
//   T_INT   - only when AVX is not supported
//   T_LONG  - only for fewer than 8 elements without AVX512VL
// Any other element type is treated as a programming error.
const bool Matcher::vector_needs_load_shuffle(BasicType elem_bt, int vlen) {
switch (elem_bt) {
case T_BYTE: return false;
case T_SHORT: return !VM_Version::supports_avx512bw();
case T_INT: return !VM_Version::supports_avx();
case T_LONG: return vlen < 8 && !VM_Version::supports_avx512vl();
default:
ShouldNotReachHere();
// Not expected to be reached; keeps every path returning a value.
return false;
}
}

MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) {
assert(Matcher::is_generic_vector(generic_opnd), "not generic");
bool legacy = (generic_opnd->opcode() == LEGVEC);
Expand Down Expand Up @@ -8340,6 +8327,17 @@ instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
//-------------------------------- Rearrange ----------------------------------

// LoadShuffle/Rearrange for Byte

// VectorLoadShuffle for T_BYTE element vectors. Input and result share the
// single operand $dst and the encoding is empty, so no machine instruction is
// emitted: the byte shuffle is used as it is.
instruct loadShuffleB(vec dst) %{
predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
match(Set dst (VectorLoadShuffle dst));
format %{ "vector_load_shuffle $dst, $dst" %}
ins_encode %{
// empty
%}
ins_pipe( pipe_slow );
%}

instruct rearrangeB(vec dst, vec shuffle) %{
predicate(Matcher::vector_element_basic_type(n) == T_BYTE &&
Matcher::vector_length(n) < 32);
Expand Down Expand Up @@ -8406,7 +8404,7 @@ instruct rearrangeB_evex_vbmi(vec dst, vec src, vec shuffle) %{

instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
!VM_Version::supports_avx512bw());
Matcher::vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
match(Set dst (VectorLoadShuffle src));
effect(TEMP dst, TEMP vtmp);
format %{ "vector_load_shuffle $dst, $src\t! using $vtmp as TEMP" %}
Expand All @@ -8417,7 +8415,7 @@ instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
if (UseAVX == 0) {
assert(vlen_in_bytes <= 16, "required");
// Multiply each shuffle by two to get byte index
__ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
__ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
__ psllw($vtmp$$XMMRegister, 1);

// Duplicate to create 2 copies of byte index
Expand All @@ -8432,7 +8430,8 @@ instruct loadShuffleS(vec dst, vec src, vec vtmp) %{
assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
int vlen_enc = vector_length_encoding(this);
// Multiply each shuffle by two to get byte index
__ vpsllw($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
__ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
__ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);

// Duplicate to create 2 copies of byte index
__ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister, 8, vlen_enc);
Expand Down Expand Up @@ -8479,6 +8478,21 @@ instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVe
ins_pipe( pipe_slow );
%}

// VectorLoadShuffle for T_SHORT element vectors when AVX512BW is available:
// the shuffle bytes in $src are zero-extended to 16-bit lanes in $dst with a
// single vpmovzxbw.
instruct loadShuffleS_evex(vec dst, vec src) %{
predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
VM_Version::supports_avx512bw());
match(Set dst (VectorLoadShuffle src));
format %{ "vector_load_shuffle $dst, $src" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
// Without AVX512VL the encoding is forced to the full 512-bit vector
// length, regardless of the node's own vector length.
if (!VM_Version::supports_avx512vl()) {
vlen_enc = Assembler::AVX_512bit;
}
__ vpmovzxbw($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

instruct rearrangeS_evex(vec dst, vec src, vec shuffle) %{
predicate(Matcher::vector_element_basic_type(n) == T_SHORT &&
VM_Version::supports_avx512bw());
Expand Down Expand Up @@ -8509,7 +8523,7 @@ instruct loadShuffleI(vec dst, vec src, vec vtmp) %{
// only byte shuffle instruction available on these platforms

// Duplicate and multiply each shuffle by 4
__ movdqu($vtmp$$XMMRegister, $src$$XMMRegister);
__ pmovzxbd($vtmp$$XMMRegister, $src$$XMMRegister);
__ pshuflw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
__ pshufhw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 0xA0);
__ psllw($vtmp$$XMMRegister, 2);
Expand Down Expand Up @@ -8538,6 +8552,18 @@ instruct rearrangeI(vec dst, vec shuffle) %{
ins_pipe( pipe_slow );
%}

// VectorLoadShuffle for T_INT / T_FLOAT element vectors when AVX is enabled
// (UseAVX > 0): the shuffle bytes in $src are zero-extended to 32-bit lanes
// in $dst with a single vpmovzxbd, encoded at the node's vector length.
instruct loadShuffleI_avx(vec dst, vec src) %{
predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
UseAVX > 0);
match(Set dst (VectorLoadShuffle src));
format %{ "vector_load_shuffle $dst, $src" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
__ vpmovzxbd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

instruct rearrangeI_avx(vec dst, vec src, vec shuffle) %{
predicate((Matcher::vector_element_basic_type(n) == T_INT || Matcher::vector_element_basic_type(n) == T_FLOAT) &&
UseAVX > 0);
Expand Down Expand Up @@ -8567,7 +8593,8 @@ instruct loadShuffleL(vec dst, vec src, vec vtmp) %{
// only double word shuffle instruction available on these platforms

// Multiply each shuffle by two to get double word index
__ vpsllq($vtmp$$XMMRegister, $src$$XMMRegister, 1, vlen_enc);
__ vpmovzxbq($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
__ vpsllq($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);

// Duplicate each double word shuffle
__ vpsllq($dst$$XMMRegister, $vtmp$$XMMRegister, 32, vlen_enc);
Expand All @@ -8593,6 +8620,20 @@ instruct rearrangeL(vec dst, vec src, vec shuffle) %{
ins_pipe( pipe_slow );
%}

// VectorLoadShuffle for 64-bit element vectors (T_LONG, T_DOUBLE), selected
// when the vector has 8 elements or AVX512VL is supported: the shuffle bytes
// in $src are zero-extended to 64-bit lanes in $dst with a single vpmovzxbq.
instruct loadShuffleL_evex(vec dst, vec src) %{
predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
(Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
match(Set dst (VectorLoadShuffle src));
format %{ "vector_load_shuffle $dst, $src" %}
ins_encode %{
// This rule is only expected to be selected at the AVX-512 level.
assert(UseAVX > 2, "required");

int vlen_enc = vector_length_encoding(this);
__ vpmovzxbq($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
%}
ins_pipe( pipe_slow );
%}

instruct rearrangeL_evex(vec dst, vec src, vec shuffle) %{
predicate(is_double_word_type(Matcher::vector_element_basic_type(n)) && // T_LONG, T_DOUBLE
(Matcher::vector_length(n) == 8 || VM_Version::supports_avx512vl()));
Expand Down
18 changes: 18 additions & 0 deletions src/hotspot/share/classfile/vmIntrinsics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -965,6 +965,24 @@ class methodHandle;
"Ljdk/internal/vm/vector/VectorSupport$VectorPayload;") \
do_name(vector_frombits_coerced_name, "fromBitsCoerced") \
\
do_intrinsic(_VectorShuffleIota, jdk_internal_vm_vector_VectorSupport, vector_shuffle_step_iota_name, vector_shuffle_step_iota_sig, F_S) \
do_signature(vector_shuffle_step_iota_sig, "(Ljava/lang/Class;" \
"Ljava/lang/Class;" \
"Ljdk/internal/vm/vector/VectorSupport$VectorSpecies;" \
"IIII" \
"Ljdk/internal/vm/vector/VectorSupport$ShuffleIotaOperation;)" \
"Ljdk/internal/vm/vector/VectorSupport$VectorShuffle;") \
do_name(vector_shuffle_step_iota_name, "shuffleIota") \
\
do_intrinsic(_VectorShuffleToVector, jdk_internal_vm_vector_VectorSupport, vector_shuffle_to_vector_name, vector_shuffle_to_vector_sig, F_S) \
do_signature(vector_shuffle_to_vector_sig, "(Ljava/lang/Class;" \
"Ljava/lang/Class;" \
"Ljava/lang/Class;" \
"Ljdk/internal/vm/vector/VectorSupport$VectorShuffle;" \
"ILjdk/internal/vm/vector/VectorSupport$ShuffleToVectorOperation;)" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;") \
do_name(vector_shuffle_to_vector_name, "shuffleToVector") \
\
do_intrinsic(_VectorLoadOp, jdk_internal_vm_vector_VectorSupport, vector_load_op_name, vector_load_op_sig, F_S) \
do_signature(vector_load_op_sig, "(Ljava/lang/Class;" \
"Ljava/lang/Class;" \
Expand Down
2 changes: 2 additions & 0 deletions src/hotspot/share/opto/c2compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -754,6 +754,8 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method) {
case vmIntrinsics::_VectorBinaryOp:
case vmIntrinsics::_VectorTernaryOp:
case vmIntrinsics::_VectorFromBitsCoerced:
case vmIntrinsics::_VectorShuffleIota:
case vmIntrinsics::_VectorShuffleToVector:
case vmIntrinsics::_VectorLoadOp:
case vmIntrinsics::_VectorLoadMaskedOp:
case vmIntrinsics::_VectorStoreOp:
Expand Down
2 changes: 1 addition & 1 deletion src/hotspot/share/opto/graphKit.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -904,7 +904,7 @@ class GraphKit : public Phase {

// Vector API support (implemented in vectorIntrinsics.cpp)
Node* box_vector(Node* in, const TypeInstPtr* vbox_type, BasicType elem_bt, int num_elem, bool deoptimize_on_exception = false);
Node* unbox_vector(Node* in, const TypeInstPtr* vbox_type, BasicType elem_bt, int num_elem);
Node* unbox_vector(Node* in, const TypeInstPtr* vbox_type, BasicType elem_bt, int num_elem, bool shuffle_to_vector = false);
Node* vector_shift_count(Node* cnt, int shift_op, BasicType bt, int num_elem);
};

Expand Down
4 changes: 4 additions & 0 deletions src/hotspot/share/opto/library_call.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -698,8 +698,12 @@ bool LibraryCallKit::try_to_inline(int predicate) {
return inline_vector_nary_operation(3);
case vmIntrinsics::_VectorFromBitsCoerced:
return inline_vector_frombits_coerced();
case vmIntrinsics::_VectorShuffleIota:
return inline_vector_shuffle_iota();
case vmIntrinsics::_VectorMaskOp:
return inline_vector_mask_operation();
case vmIntrinsics::_VectorShuffleToVector:
return inline_vector_shuffle_to_vector();
case vmIntrinsics::_VectorLoadOp:
return inline_vector_mem_operation(/*is_store=*/false);
case vmIntrinsics::_VectorLoadMaskedOp:
Expand Down
2 changes: 2 additions & 0 deletions src/hotspot/share/opto/library_call.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,8 @@ class LibraryCallKit : public GraphKit {
// Vector API support
bool inline_vector_nary_operation(int n);
bool inline_vector_frombits_coerced();
bool inline_vector_shuffle_to_vector();
bool inline_vector_shuffle_iota();
bool inline_vector_mask_operation();
bool inline_vector_mem_operation(bool is_store);
bool inline_vector_mem_masked_operation(bool is_store);
Expand Down
2 changes: 0 additions & 2 deletions src/hotspot/share/opto/matcher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -340,8 +340,6 @@ class Matcher : public PhaseTransform {

static const bool vector_needs_partial_operations(Node* node, const TypeVect* vt);

static const bool vector_needs_load_shuffle(BasicType elem_bt, int vlen);

static const RegMask* predicate_reg_mask(void);
static const TypeVectMask* predicate_reg_type(const Type* elemTy, int length);

Expand Down
Loading

1 comment on commit 9998c07

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.