Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions src/hotspot/cpu/aarch64/matcher_aarch64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -177,10 +177,6 @@
return is_alltrue ? BoolTest::eq : BoolTest::ne;
}

// Answers whether vector indexes need pruning for element type 'bt' and
// vector length 'vlen'. This implementation ignores both arguments and
// always answers false.
static bool vector_indexes_needs_pruning(BasicType bt, int vlen) {
return false;
}

// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
switch(vopc) {
Expand Down
4 changes: 0 additions & 4 deletions src/hotspot/cpu/arm/matcher_arm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,10 +170,6 @@
return BoolTest::illegal;
}

// Answers whether vector indexes need pruning for element type 'bt' and
// vector length 'vlen'. This implementation ignores both arguments and
// always answers false.
static bool vector_indexes_needs_pruning(BasicType bt, int vlen) {
return false;
}

// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
switch(vopc) {
Expand Down
4 changes: 0 additions & 4 deletions src/hotspot/cpu/ppc/matcher_ppc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,10 +179,6 @@
return BoolTest::illegal;
}

// Answers whether vector indexes need pruning for element type 'bt' and
// vector length 'vlen'. This implementation ignores both arguments and
// always answers false.
static bool vector_indexes_needs_pruning(BasicType bt, int vlen) {
return false;
}

// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
switch(vopc) {
Expand Down
4 changes: 0 additions & 4 deletions src/hotspot/cpu/riscv/matcher_riscv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,6 @@
return is_alltrue ? BoolTest::eq : BoolTest::ne;
}

// Answers whether vector indexes need pruning for element type 'bt' and
// vector length 'vlen'. This implementation ignores both arguments and
// always answers false.
static bool vector_indexes_needs_pruning(BasicType bt, int vlen) {
return false;
}

// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
switch(vopc) {
Expand Down
4 changes: 0 additions & 4 deletions src/hotspot/cpu/s390/matcher_s390.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,10 +168,6 @@
return BoolTest::illegal;
}

// Answers whether vector indexes need pruning for element type 'bt' and
// vector length 'vlen'. This implementation ignores both arguments and
// always answers false.
static bool vector_indexes_needs_pruning(BasicType bt, int vlen) {
return false;
}

// Returns pre-selection estimated size of a vector operation.
static int vector_op_pre_select_sz_estimate(int vopc, BasicType ety, int vlen) {
switch(vopc) {
Expand Down
2 changes: 0 additions & 2 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6459,11 +6459,9 @@ void C2_MacroAssembler::select_from_two_vectors_evex(BasicType elem_bt, XMMRegis
evpermi2q(dst, src1, src2, vlen_enc);
break;
case T_FLOAT:
vcvttps2dq(dst, dst, vlen_enc);
evpermi2ps(dst, src1, src2, vlen_enc);
break;
case T_DOUBLE:
evcvttpd2qq(dst, dst, vlen_enc);
evpermi2pd(dst, src1, src2, vlen_enc);
break;
default:
Expand Down
12 changes: 0 additions & 12 deletions src/hotspot/cpu/x86/matcher_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,18 +212,6 @@
return BoolTest::lt;
}

// Answers whether vector indexes need pruning for element type 'bt'.
// The vector length 'vlen' is not consulted.
//   - T_SHORT:           pruning is needed unless AVX512BW is supported.
//   - T_LONG / T_DOUBLE: pruning is needed unless AVX512VL is supported.
//   - any other type:    no pruning.
static bool vector_indexes_needs_pruning(BasicType bt, int vlen) {
  if (bt == T_SHORT) {
    return !VM_Version::supports_avx512bw();
  }
  if (bt == T_LONG || bt == T_DOUBLE) {
    return !VM_Version::supports_avx512vl();
  }
  return false;
}

// Returns pre-selection estimated size of a vector operation.
// Currently, it's a rudimentary heuristic based on emitted code size for complex
// IR nodes used by unroll policy. Idea is to constrain unrolling factor and prevent
Expand Down
5 changes: 1 addition & 4 deletions src/hotspot/cpu/x86/x86.ad
Original file line number Diff line number Diff line change
Expand Up @@ -1945,10 +1945,7 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
if (bt == T_BYTE && !VM_Version::supports_avx512_vbmi()) {
return false;
}
if ((bt == T_INT || bt == T_FLOAT) && !VM_Version::supports_evex()) {
return false;
}
if (bt == T_DOUBLE && !VM_Version::supports_avx512dq()) {
if ((bt == T_INT || bt == T_FLOAT || bt == T_DOUBLE) && !VM_Version::supports_evex()) {
return false;
}
break;
Expand Down
2 changes: 2 additions & 0 deletions src/hotspot/share/classfile/vmIntrinsics.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -978,6 +978,8 @@ class methodHandle;
\
do_intrinsic(_VectorSelectFromTwoVectorOp, jdk_internal_vm_vector_VectorSupport, vector_select_from_op_name, vector_select_from_op_sig, F_S) \
do_signature(vector_select_from_op_sig, "(Ljava/lang/Class;" \
"Ljava/lang/Class;" \
"Ljava/lang/Class;" \
"Ljava/lang/Class;" \
"I" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;" \
Expand Down
80 changes: 51 additions & 29 deletions src/hotspot/share/opto/vectorIntrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2632,24 +2632,32 @@ bool LibraryCallKit::inline_vector_extract() {
}


// public static
// <V extends Vector<E>,
// E>
// V selectFromTwoVectorOp(Class<? extends V> vClass, Class<E> eClass, int length,
// V v1, V v2, V v3,
// SelectFromTwoVector<V> defaultImpl)
// public static
// <V extends Vector<E1>,
// VI extends Vector<E2>,
// E1,
// E2>
// V selectFromTwoVectorOp(Class<? extends V> vClass, Class<? extends VI> viClass,
// Class<E1> eClass, Class<E2> iClass, int length,
// VI v1, V v2, V v3,
// SelectFromTwoVector<V> defaultImpl)
bool LibraryCallKit::inline_vector_select_from_two_vectors() {
const TypeInstPtr* vector_klass = gvn().type(argument(0))->isa_instptr();
const TypeInstPtr* elem_klass = gvn().type(argument(1))->isa_instptr();
const TypeInt* vlen = gvn().type(argument(2))->isa_int();

if (vector_klass == nullptr || elem_klass == nullptr || vlen == nullptr ||
vector_klass->const_oop() == nullptr || elem_klass->const_oop() == nullptr ||
!vlen->is_con()) {
log_if_needed(" ** missing constant: vclass=%s etype=%s vlen=%s",
const TypeInstPtr* index_vector_klass = gvn().type(argument(1))->isa_instptr();
const TypeInstPtr* elem_klass = gvn().type(argument(2))->isa_instptr();
const TypeInstPtr* index_elem_klass = gvn().type(argument(3))->isa_instptr();
const TypeInt* vlen = gvn().type(argument(4))->isa_int();

if (vector_klass == nullptr || index_vector_klass == nullptr || elem_klass == nullptr ||
index_elem_klass == nullptr || vlen == nullptr || vector_klass->const_oop() == nullptr ||
index_vector_klass->const_oop() == nullptr || elem_klass->const_oop() == nullptr ||
index_elem_klass->const_oop() == nullptr || !vlen->is_con()) {
log_if_needed(" ** missing constant: vclass=%s viclass = %s etype=%s itype = %s vlen=%s",
NodeClassNames[argument(0)->Opcode()],
NodeClassNames[argument(1)->Opcode()],
NodeClassNames[argument(2)->Opcode()]);
NodeClassNames[argument(2)->Opcode()],
NodeClassNames[argument(3)->Opcode()],
NodeClassNames[argument(4)->Opcode()]);
return false; // not enough info for intrinsification
}

Expand All @@ -2658,50 +2666,64 @@ bool LibraryCallKit::inline_vector_select_from_two_vectors() {
return false;
}

if (!is_klass_initialized(index_vector_klass)) {
log_if_needed(" ** klass argument not initialized");
return false;
}

ciType* elem_type = elem_klass->const_oop()->as_instance()->java_mirror_type();
if (!elem_type->is_primitive_type()) {
log_if_needed(" ** not a primitive bt=%d", elem_type->basic_type());
return false; // should be primitive type
}

ciType* index_elem_type = index_elem_klass->const_oop()->as_instance()->java_mirror_type();
if (!index_elem_type->is_primitive_type()) {
log_if_needed(" ** index element not a primitive bt=%d", index_elem_type->basic_type());
return false; // should be primitive type
}

int num_elem = vlen->get_con();
BasicType elem_bt = elem_type->basic_type();
BasicType index_elem_bt = index_elem_type->basic_type();
assert(!is_floating_point_type(index_elem_bt), "floating point index element type");

if (!arch_supports_vector(Op_SelectFromTwoVector, num_elem, elem_bt, VecMaskNotUsed)) {
int opc = VectorSupport::vop2ideal(VectorSupport::VECTOR_OP_SUB, elem_bt);
int sopc = VectorNode::opcode(opc, elem_bt);
if (!arch_supports_vector(Op_VectorMaskCmp, num_elem, elem_bt, VecMaskNotUsed) ||
!arch_supports_vector(Op_VectorBlend, num_elem, elem_bt, VecMaskUseLoad) ||
!arch_supports_vector(Op_VectorRearrange, num_elem, elem_bt, VecMaskNotUsed) ||
(!is_integral_type(elem_bt) &&
((elem_bt == T_FLOAT && !arch_supports_vector(Op_VectorCastF2X, num_elem, T_INT, VecMaskNotUsed)) ||
(elem_bt == T_DOUBLE && !arch_supports_vector(Op_VectorCastD2X, num_elem, T_LONG, VecMaskNotUsed)))) ||
!arch_supports_vector(sopc, num_elem, elem_bt, VecMaskNotUsed)) {
int cast_vopc = VectorCastNode::opcode(-1, index_elem_bt, true);
if (!arch_supports_vector(Op_VectorMaskCmp, num_elem, T_BYTE, VecMaskNotUsed) ||
!arch_supports_vector(Op_AndV, num_elem, T_BYTE, VecMaskNotUsed) ||
!arch_supports_vector(Op_VectorBlend, num_elem, elem_bt, VecMaskUseLoad) ||
!arch_supports_vector(Op_VectorRearrange, num_elem, elem_bt, VecMaskNotUsed) ||
!arch_supports_vector(cast_vopc, num_elem, T_BYTE, VecMaskNotUsed) ||
!arch_supports_vector(Op_VectorLoadShuffle, num_elem, index_elem_bt, VecMaskNotUsed) ||
!arch_supports_vector(Op_Replicate, num_elem, T_BYTE, VecMaskNotUsed)) {
log_if_needed(" ** not supported: opc=%d vlen=%d etype=%s ismask=useload",
Op_SelectFromTwoVector, num_elem, type2name(elem_bt));
return false; // not supported
}
}

ciKlass* vbox_klass = vector_klass->const_oop()->as_instance()->java_lang_Class_klass();
ciKlass* index_vbox_klass = index_vector_klass->const_oop()->as_instance()->java_lang_Class_klass();
const TypeInstPtr* vbox_type = TypeInstPtr::make_exact(TypePtr::NotNull, vbox_klass);
const TypeInstPtr* index_vbox_type = TypeInstPtr::make_exact(TypePtr::NotNull, index_vbox_klass);

Node* opd1 = unbox_vector(argument(3), vbox_type, elem_bt, num_elem);
Node* opd1 = unbox_vector(argument(5), index_vbox_type, index_elem_bt, num_elem);
if (opd1 == nullptr) {
log_if_needed(" ** unbox failed v1=%s",
NodeClassNames[argument(3)->Opcode()]);
NodeClassNames[argument(5)->Opcode()]);
return false;
}
Node* opd2 = unbox_vector(argument(4), vbox_type, elem_bt, num_elem);
Node* opd2 = unbox_vector(argument(6), vbox_type, elem_bt, num_elem);
if (opd2 == nullptr) {
log_if_needed(" ** unbox failed v1=%s",
NodeClassNames[argument(4)->Opcode()]);
NodeClassNames[argument(6)->Opcode()]);
return false;
}
Node* opd3 = unbox_vector(argument(5), vbox_type, elem_bt, num_elem);
Node* opd3 = unbox_vector(argument(7), vbox_type, elem_bt, num_elem);
if (opd3 == nullptr) {
log_if_needed(" ** unbox failed v1=%s",
NodeClassNames[argument(5)->Opcode()]);
NodeClassNames[argument(7)->Opcode()]);
return false;
}

Expand Down
104 changes: 32 additions & 72 deletions src/hotspot/share/opto/vectornode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2087,106 +2087,66 @@ Node* VectorBlendNode::Identity(PhaseGVN* phase) {
Node* SelectFromTwoVectorNode::Ideal(PhaseGVN* phase, bool can_reshape) {
int num_elem = vect_type()->length();
BasicType elem_bt = vect_type()->element_basic_type();

// Keep the node if it is supported, else lower it to other nodes.
if (Matcher::match_rule_supported_vector(Op_SelectFromTwoVector, num_elem, elem_bt)) {
return nullptr;
}

Node* index_vec = in(1);
Node* src1 = in(2);
Node* src2 = in(3);

// Lower the IR to constituents operations.
// SelectFromTwoVectorNode =
// (VectorBlend
// (VectorRearrange SRC1 INDEX)
// (VectorRearrange SRC2 NORM_INDEX)
// (VectorRearrange SRC1 (WRAPPED_INDEX AND (VLEN-1)))
// (VectorRearrange SRC2 (WRAPPED_INDEX AND (VLEN-1)))
// MASK)
// Where
// incoming WRAPPED_INDEX is within two vector index range [0, VLEN*2) and
// MASK = WRAPPED_INDEX < VLEN
//
// MASK = INDEX < num_elem
//
// This shall prevent an intrinsification failure and associated argument
// IR lowering prevents intrinsification failure and associated argument
// boxing penalties.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A quick comment about how the mask is computed could be nice.
MASK = INDEX < num_elem

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jatin-bhateja very nice, thanks!

// Here, MASK lanes corresponding to INDEX values greater than or equal to
// vector length (VLEN) are set and are used to select the elements from
// the second source (SRC2) vector.
//

const Type* lane_count_type = nullptr;
switch(elem_bt) {
case T_BYTE:
case T_SHORT:
case T_INT:
case T_FLOAT:
lane_count_type = TypeInt::make(num_elem);
break;
case T_DOUBLE:
case T_LONG:
lane_count_type = TypeLong::make(num_elem);
break;
default:
fatal("Unsupported vectortype (%s)", type2name(elem_bt));
break;
}
const TypeVect* index_vect_type = index_vec->bottom_type()->is_vect();
BasicType index_elem_bt = index_vect_type->element_basic_type();
assert(!is_floating_point_type(index_elem_bt), "");

BasicType integral_elem_bt = elem_bt;
Node* integral_index_vec = index_vec;
if (elem_bt == T_FLOAT) {
integral_elem_bt = T_INT;
integral_index_vec = phase->transform(new VectorCastF2XNode(index_vec, TypeVect::make(integral_elem_bt, num_elem)));
} else if (elem_bt == T_DOUBLE) {
integral_elem_bt = T_LONG;
integral_index_vec = phase->transform(new VectorCastD2XNode(index_vec, TypeVect::make(integral_elem_bt, num_elem)));
}
// Downcast the index vector to a type-agnostic shuffle representation: shuffle
// indices are held in a byte vector, which is later massaged into the
// target-specific permutation index format by a subsequent VectorLoadShuffle.
int cast_vopc = VectorCastNode::opcode(0, index_elem_bt, true);
Node* index_byte_vec = phase->transform(VectorCastNode::make(cast_vopc, index_vec, T_BYTE, num_elem));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This cast assumes that the indices cannot have more than 8 bits. This would allow vector lengths of up to 256. This is fine for Intel. But as far as I know ARM has in principle longer vectors - up to 2048 bits. Should we maybe add some assert here to make sure we never badly truncate the index?

Copy link
Member Author

@jatin-bhateja jatin-bhateja Sep 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shuffle overhaul is on our todo list; it's a known limitation which we tried lifting once. Yes, you read it correctly: it's a limitation for AARCH64 SVE once 2048-bit vector systems are available. IIRC the current max vector size on any available AARCH64 system is 256 bits; with Neoverse V2 they shrank the vector size back to 16 bytes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are there any asserts that would catch this?


int opc = VectorSupport::vop2ideal(VectorSupport::VECTOR_OP_SUB, integral_elem_bt);
int sopc = VectorNode::opcode(opc, integral_elem_bt);
Node* lane_cnt_m1 = phase->makecon(TypeInt::make(num_elem - 1));
Node* bcast_lane_cnt_m1_vec = phase->transform(VectorNode::scalar2vector(lane_cnt_m1, num_elem, Type::get_const_basic_type(T_BYTE), false));

BoolTest::mask pred = BoolTest::lt;
// Compute the blend mask for merging two independently permuted vectors
// using shuffle index in two vector index range [0, VLEN * 2).
BoolTest::mask pred = BoolTest::le;
ConINode* pred_node = (ConINode*)phase->makecon(TypeInt::make(pred));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would as_ConI() be a better alternative to the (ConINode*) cast?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please at least add a comment why you are not following my suggestion. I feel like the work I put in to review is not being respected when comments are just silently resolved without any action or comment.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I really do think that as_ConI() would be the right thing here. In product it is just a cast, and in debug at least we have an assert.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DONE

Copy link
Member Author

@jatin-bhateja jatin-bhateja Sep 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It just got overlooked @eme64, we should always respect reviewer's suggestions and should value the time they invest in polishing our patches, thanks again :-)

Node* lane_cnt = phase->makecon(lane_count_type);
Node* bcast_lane_cnt_vec = phase->transform(VectorNode::scalar2vector(lane_cnt, num_elem, Type::get_const_basic_type(integral_elem_bt), false));
const TypeVect* vmask_type = TypeVect::makemask(T_BYTE, num_elem);
Node* mask = phase->transform(new VectorMaskCmpNode(pred, index_byte_vec, bcast_lane_cnt_m1_vec, pred_node, vmask_type));

// Rearrange expects the indexes to lie within single vector index range [0, VLEN).
index_byte_vec = phase->transform(VectorNode::make(Op_AndV, index_byte_vec, bcast_lane_cnt_m1_vec, index_byte_vec->bottom_type()->is_vect()));

// Comparison over integral vectors weeds out emitting additional
// instructions for checking special floating point values.
const TypeVect* vmask_type = TypeVect::makemask(integral_elem_bt, num_elem);
Node* mask = phase->transform(new VectorMaskCmpNode(pred, integral_index_vec, bcast_lane_cnt_vec, pred_node, vmask_type));
// Load indexes from the byte vector and appropriately massage them to the
// target-specific permutation index format.
index_vec = phase->transform(new VectorLoadShuffleNode(index_byte_vec, index_vect_type));

vmask_type = TypeVect::makemask(elem_bt, num_elem);
mask = phase->transform(new VectorMaskCastNode(mask, vmask_type));

Node* p1 = phase->transform(new VectorRearrangeNode(src1, integral_index_vec));
Node* normalized_index_vec = phase->transform(VectorNode::make(sopc, integral_index_vec, bcast_lane_cnt_vec, vect_type()));
Node* p2 = phase->transform(new VectorRearrangeNode(src2, normalized_index_vec));
Node* p1 = phase->transform(new VectorRearrangeNode(src1, index_vec));
Node* p2 = phase->transform(new VectorRearrangeNode(src2, index_vec));

return new VectorBlendNode(p2, p1, mask);
}

// Idealization hook for VectorRearrange.
//
// When the shuffle input in(2) is neither a VectorUnbox nor a
// VectorLoadShuffle, the rearrange is supported by the matcher, and the
// target reports that vector indexes need pruning for this element type and
// length, the shuffle is rebuilt as
//   VectorLoadShuffle(VectorCast-to-byte(in(2)))
// and a new VectorRearrange over in(1) and that shuffle is returned.
// In every other case nullptr is returned (no change).
Node* VectorRearrangeNode::Ideal(PhaseGVN* phase, bool can_reshape) {
  const BasicType bt = vect_type()->element_basic_type();
  const int lanes = vect_type()->length();

  // Shuffles coming from an unbox or an explicit load-shuffle are left alone.
  const int shuffle_opc = in(2)->Opcode();
  if (shuffle_opc == Op_VectorUnbox || shuffle_opc == Op_VectorLoadShuffle) {
    return nullptr;
  }
  if (!Matcher::match_rule_supported_vector(Op_VectorRearrange, lanes, bt) ||
      !Matcher::vector_indexes_needs_pruning(bt, lanes)) {
    return nullptr;
  }

  // Floating point element types use the same-sized integral type for the
  // reloaded shuffle vector.
  const BasicType shuffle_bt = (bt == T_FLOAT)  ? T_INT
                             : (bt == T_DOUBLE) ? T_LONG
                             : bt;

  // Targets emulating unsupported permutation for certain vector types
  // may need to massage the indexes to match the users intent.
  // Lowering index vector to a bytevector followed by an explicit loadshuffle
  // will bring the indexes in the consumable format.
  const int to_byte_opc = VectorCastNode::opcode(-1, bt, true);
  Node* byte_shuffle = phase->transform(VectorCastNode::make(to_byte_opc, in(2), T_BYTE, lanes));
  const TypeVect* shuffle_vt = TypeVect::make(shuffle_bt, lanes);
  Node* reloaded_shuffle = phase->transform(new VectorLoadShuffleNode(byte_shuffle, shuffle_vt));
  return new VectorRearrangeNode(in(1), reloaded_shuffle);
}

#ifndef PRODUCT
void VectorBoxAllocateNode::dump_spec(outputStream *st) const {
Expand Down
1 change: 0 additions & 1 deletion src/hotspot/share/opto/vectornode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1607,7 +1607,6 @@ class VectorRearrangeNode : public VectorNode {
// assert(mask->is_VectorMask(), "VectorBlendNode requires that third argument be a mask");
}

Node* Ideal(PhaseGVN* phase, bool can_reshape);
virtual int Opcode() const;
Node* vec1() const { return in(1); }
Node* vec_shuffle() const { return in(2); }
Expand Down
Loading