Skip to content
Open
7 changes: 3 additions & 4 deletions src/hotspot/cpu/x86/assembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6608,17 +6608,16 @@ void Assembler::palignr(XMMRegister dst, XMMRegister src, int imm8) {
}

void Assembler::vpalignr(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
assert(vector_len == AVX_128bit? VM_Version::supports_avx() :
vector_len == AVX_256bit? VM_Version::supports_avx2() :
vector_len == AVX_512bit? VM_Version::supports_avx512bw() :
0, "");
assert(UseAVX > 0 && (vector_len == Assembler::AVX_512bit || (!needs_evex(dst, nds, src) || VM_Version::supports_avx512vl())), "");
assert(!needs_evex(dst, nds, src) || VM_Version::supports_avx512bw(), "");
InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int24(0x0F, (0xC0 | encode), imm8);
}

void Assembler::evalignd(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
Expand Down
12 changes: 6 additions & 6 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7115,7 +7115,7 @@ void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, X
void C2_MacroAssembler::vector_slice_32B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister xtmp, int origin, int vlen_enc) {
assert(vlen_enc == Assembler::AVX_256bit, "");
if (origin <= 16) {
if (origin < 16) {
// ALIGNR instruction concatenates the corresponding 128 bit
// lanes of two source vectors and then performs the right
// shift operation over intermediate value. Thus source vectors
Expand Down Expand Up @@ -7156,7 +7156,7 @@ void C2_MacroAssembler::vector_slice_32B_op(XMMRegister dst, XMMRegister src1, X

void C2_MacroAssembler::vector_slice_64B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister xtmp, int origin, int vlen_enc) {
if (origin <= 16) {
if (origin < 16) {
// Initial source vectors
// 0.........512 0.........512
// src1 = [v1 v2 v3 v4] and src2 = [v5 v6 v7 v8]
Expand Down Expand Up @@ -7184,7 +7184,7 @@ void C2_MacroAssembler::vector_slice_64B_op(XMMRegister dst, XMMRegister src1, X
// |_____________|
evalignd(xtmp, src2, src1, 4, vlen_enc);
vpalignr(dst, xtmp, src1, origin, vlen_enc);
} else if (origin > 16 && origin <= 32) {
} else if (origin > 16 && origin < 32) {
// Similarly, for SHIFT between 16 and 32 bytes
// result will be sliced out of src1 and lower
// two 128 bit lanes of src2.
Expand All @@ -7198,7 +7198,7 @@ void C2_MacroAssembler::vector_slice_64B_op(XMMRegister dst, XMMRegister src1, X
evalignd(xtmp, src2, src1, 4, vlen_enc);
evalignd(dst, src2, src1, 8, vlen_enc);
vpalignr(dst, dst, xtmp, origin - 16, vlen_enc);
} else if (origin > 32 && origin <= 48) {
} else if (origin > 32 && origin < 48) {
// For SHIFT between 32 and 48 bytes
// result will be sliced out of src1 and lower
// four 128 bit lanes of src2.
Expand All @@ -7223,15 +7223,15 @@ void C2_MacroAssembler::vector_slice_64B_op(XMMRegister dst, XMMRegister src1, X
// res[511:384] = {src2[511:384], src2[383:256]}
// Thus, source vector lanes should have following format.
// src1 = {v4, v5, v6, v7} and src2 = {v5, v6, v7, v8}
assert(origin > 48 && origin <= 64, "");
assert(origin > 48 && origin < 64, "");
evalignd(xtmp, src2, src1, 12, vlen_enc);
vpalignr(dst, src2, xtmp, origin - 48, vlen_enc);
}
}

void C2_MacroAssembler::vector_slice_op(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister xtmp, int origin, int vlen_enc) {
if (VM_Version::supports_avx512vl() || vlen_enc == Assembler::AVX_512bit) {
if (VM_Version::supports_avx512vlbw()) {
vector_slice_64B_op(dst, src1, src2, xtmp, origin, vlen_enc);
} else {
assert(vlen_enc == Assembler::AVX_256bit, "");
Expand Down
34 changes: 32 additions & 2 deletions src/hotspot/cpu/x86/x86.ad
Original file line number Diff line number Diff line change
Expand Up @@ -1727,6 +1727,9 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
if (UseAVX < 1 || size_in_bits < 128) {
return false;
}
if (size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
return false;
}
break;
case Op_VectorLoadShuffle:
case Op_VectorRearrange:
Expand Down Expand Up @@ -10776,9 +10779,23 @@ instruct vector_slice_const_origin_LT16B_reg(vec dst, vec src1, vec src2, immI o
ins_pipe(pipe_slow);
%}

instruct vector_slice_const_origin_GT16B_index16B_reg(vec dst, vec src1, vec src2, immI origin)
%{
  predicate(Matcher::vector_length_in_bytes(n) > 16 && !VM_Version::supports_avx512vlbw() && n->in(2)->get_int() == 16);
  match(Set dst (VectorSlice (Binary src1 src2) origin));
  format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2" %}
  ins_encode %{
    // Slicing a 32-byte vector pair at byte offset 16 selects exactly the
    // upper 128-bit lane of src1 followed by the lower 128-bit lane of src2,
    // so a single lane permute suffices (imm 0x21 = {src2.lane0, src1.lane1}).
    // src1 = [v2, v1], src2 = [v4, v3]
    // dst = [v3, v2]
    // NOTE: the previous version computed vector_length_encoding(this) into
    // an unused local; vperm2i128 takes no vector-length argument.
    __ vperm2i128($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0x21);
  %}
  ins_pipe(pipe_slow);
%}

instruct vector_slice_const_origin_GT16B_reg(vec dst, vec src1, vec src2, immI origin, vec xtmp)
%{
predicate(Matcher::vector_length_in_bytes(n) > 16 && !VM_Version::supports_avx512vlbw());
predicate(Matcher::vector_length_in_bytes(n) > 16 && !VM_Version::supports_avx512vlbw() && n->in(2)->get_int() != 16);
match(Set dst (VectorSlice (Binary src1 src2) origin));
effect(TEMP xtmp);
format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2 \t!using $xtmp as TEMP" %}
Expand All @@ -10789,9 +10806,22 @@ instruct vector_slice_const_origin_GT16B_reg(vec dst, vec src1, vec src2, immI o
ins_pipe(pipe_slow);
%}

// Slice of a >16B vector with a constant byte origin that is a multiple of 4,
// on AVX512VL+BW targets: a single 32-bit-element VALIGND covers the whole
// operation, with the origin rescaled from bytes to dwords.
instruct vector_slice_const_origin_GT16B_index_multiple4_reg_evex(vec dst, vec src1, vec src2, immI origin)
%{
predicate(Matcher::vector_length_in_bytes(n) > 16 && VM_Version::supports_avx512vlbw() && (n->in(2)->get_int() & 0x3) == 0);
match(Set dst (VectorSlice (Binary src1 src2) origin));
format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
// Convert the byte offset to a dword (32-bit element) count for VALIGND.
int normalized_origin = $origin$$constant >> 2;
__ evalignd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, normalized_origin, vlen_enc);
%}
ins_pipe(pipe_slow);
%}

instruct vector_slice_const_origin_GT16B_reg_evex(vec dst, vec src1, vec src2, immI origin, vec xtmp)
%{
predicate(Matcher::vector_length_in_bytes(n) > 16 && VM_Version::supports_avx512vlbw());
predicate(Matcher::vector_length_in_bytes(n) > 16 && VM_Version::supports_avx512vlbw() && (n->in(2)->get_int() & 0x3) != 0);
match(Set dst (VectorSlice (Binary src1 src2) origin));
effect(TEMP dst, TEMP xtmp);
format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2 \t!using $xtmp as TEMP" %}
Expand Down
7 changes: 4 additions & 3 deletions src/hotspot/share/opto/vectorIntrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1672,7 +1672,7 @@ bool LibraryCallKit::inline_vector_blend() {
// Class<? extends V> vClass, Class<E> eClass, int length, V v1, V v2,
// VectorSliceOp<V> defaultImpl)
bool LibraryCallKit::inline_vector_slice() {
const TypeInt* origin = gvn().type(argument(0))->isa_int();
const TypeInt* origin = gvn().type(argument(0))->isa_int();
const TypeInstPtr* vector_klass = gvn().type(argument(1))->isa_instptr();
const TypeInstPtr* elem_klass = gvn().type(argument(2))->isa_instptr();
const TypeInt* vlen = gvn().type(argument(3))->isa_int();
Expand Down Expand Up @@ -1700,13 +1700,13 @@ bool LibraryCallKit::inline_vector_slice() {
int num_elem = vlen->get_con();
BasicType elem_bt = elem_type->basic_type();

if (!Matcher::supports_vector_slice_with_non_constant_index(num_elem, elem_bt) && !origin->is_con()) {
if (Matcher::supports_vector_slice_with_non_constant_index(num_elem, elem_bt) || !origin->is_con()) {
log_if_needed(" ** vector slice from non-constant index not supported");
return false;
}

if (!arch_supports_vector(Op_VectorSlice, num_elem, elem_bt, VecMaskNotUsed)) {
log_if_needed(" ** not supported: arity=2 op=slice vlen=%d etype=%s ismask=useload/none",
log_if_needed(" ** not supported: arity=2 op=slice vlen=%d etype=%s",
num_elem, type2name(elem_bt));
return false; // not supported
}
Expand All @@ -1720,6 +1720,7 @@ bool LibraryCallKit::inline_vector_slice() {
return false; // operand unboxing failed
}

// Define the origin in terms of bytes to make it a type-agnostic value.
Node* origin_node = gvn().intcon(origin->get_con() * type2aelembytes(elem_bt));

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q1: Is it possible to just pass origin->get_con() to VectorSliceNode, in case there are architectures that need it directly? Or maybe we'd better add a comment noting that the origin passed to VectorSliceNode is adjusted to bytes.

Q2: If origin is not a constant, and there is an architecture that supports the index as a variable, will the code crash here? Can we just limit the origin to a constant for this intrinsification in this PR? We can consider extending it to a variable in case any architecture has such a requirement. WDYT?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Q1: Is it possible to just pass origin->get_con() to VectorSliceNode, in case there are architectures that need it directly? Or maybe we'd better add a comment noting that the origin passed to VectorSliceNode is adjusted to bytes.

Added comments.

Q2: If origin is not a constant, and there is an architecture that supports the index as a variable, will the code crash here? Can we just limit the origin to a constant for this intrinsification in this PR? We can consider extending it to a variable in case any architecture has such a requirement. WDYT?

Currently, inline expander only supports constant origin. I have added a check to fail intrinsification and inline fallback using the hybrid call generator.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for your updating! So maybe the matcher function supports_vector_slice_with_non_constant_index() could also be removed totally?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the idea here is to intrinsify only the particular scenario where the slice index is a constant value, and not burden the inline expander with full-blown intrinsification of all possible control paths, without impacting performance.

const TypeVect* vector_type = TypeVect::make(elem_bt, num_elem);
Node* operation = gvn().transform(new VectorSliceNode(v1, v2, origin_node, vector_type));
Expand Down
15 changes: 15 additions & 0 deletions src/hotspot/share/opto/vectornode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2403,6 +2403,21 @@ Node* UMaxVNode::Identity(PhaseGVN* phase) {
}
return this;
}

// Fold away a slice whose constant origin selects one source vector
// unchanged: origin 0 yields the first vector, and an origin equal to the
// vector length in bytes yields the second.
Node* VectorSliceNode::Identity(PhaseGVN* phase) {
  if (!origin()->is_Con()) {
    return this;
  }
  jint start = origin()->get_int();
  if (start == 0) {
    return vec1();
  }
  if ((uint)start == vect_type()->length_in_bytes()) {
    return vec2();
  }
  return this;
}

#ifndef PRODUCT
void VectorBoxAllocateNode::dump_spec(outputStream *st) const {
CallStaticJavaNode::dump_spec(st);
Expand Down
1 change: 1 addition & 0 deletions src/hotspot/share/opto/vectornode.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1727,6 +1727,7 @@ class VectorSliceNode : public VectorNode {
Node* vec1() const { return in(1); }
Node* vec2() const { return in(2); }
Node* origin() const { return in(3); }
virtual Node* Identity(PhaseGVN* phase);
};


Expand Down
20 changes: 20 additions & 0 deletions test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -2240,6 +2240,26 @@ public class IRNode {
machOnlyNameRegex(RISCV_VAND_NOTL_VX_MASKED, "vand_notL_vx_masked");
}

// Regex handles for the VectorSlice IR node, one per element type, so @IR
// rules can assert that Vector API slice operations are matched in compiled
// code.
public static final String VECTOR_SLICE_B = VECTOR_PREFIX + "VECTOR_SLICE_B" + POSTFIX;
static {
vectorNode(VECTOR_SLICE_B, "VectorSlice", TYPE_BYTE);
}

public static final String VECTOR_SLICE_S = VECTOR_PREFIX + "VECTOR_SLICE_S" + POSTFIX;
static {
vectorNode(VECTOR_SLICE_S, "VectorSlice", TYPE_SHORT);
}

public static final String VECTOR_SLICE_I = VECTOR_PREFIX + "VECTOR_SLICE_I" + POSTFIX;
static {
vectorNode(VECTOR_SLICE_I, "VectorSlice", TYPE_INT);
}

public static final String VECTOR_SLICE_L = VECTOR_PREFIX + "VECTOR_SLICE_L" + POSTFIX;
static {
vectorNode(VECTOR_SLICE_L, "VectorSlice", TYPE_LONG);
}

public static final String VECTOR_BLEND_B = VECTOR_PREFIX + "VECTOR_BLEND_B" + POSTFIX;
static {
vectorNode(VECTOR_BLEND_B, "VectorBlend", TYPE_BYTE);
Expand Down
Loading