14 changes: 11 additions & 3 deletions src/hotspot/cpu/x86/assembler_x86.cpp
@@ -6608,14 +6608,22 @@ void Assembler::palignr(XMMRegister dst, XMMRegister src, int imm8) {
}

void Assembler::vpalignr(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len) {
assert(vector_len == AVX_128bit? VM_Version::supports_avx() :
vector_len == AVX_256bit? VM_Version::supports_avx2() :
0, "");
assert(UseAVX > 0 && (vector_len == Assembler::AVX_512bit || (!needs_evex(dst, nds, src) || VM_Version::supports_avx512vl())), "");
assert(!needs_evex(dst, nds, src) || VM_Version::supports_avx512bw(), "");
InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int24(0x0F, (0xC0 | encode), imm8);
}

void Assembler::evalignd(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8, int vector_len) {
assert(VM_Version::supports_evex(), "");
assert(vector_len == AVX_512bit || VM_Version::supports_avx512vl(), "");
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
attributes.set_is_evex_instruction();
int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);
emit_int24(0x3, (0xC0 | encode), imm8);
}

void Assembler::evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8) {
assert(VM_Version::supports_evex(), "");
InstructionAttr attributes(AVX_512bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
1 change: 1 addition & 0 deletions src/hotspot/cpu/x86/assembler_x86.hpp
@@ -2167,6 +2167,7 @@ class Assembler : public AbstractAssembler {
void palignr(XMMRegister dst, XMMRegister src, int imm8);
void vpalignr(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
void evalignq(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
void evalignd(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8, int vector_len);

void pblendw(XMMRegister dst, XMMRegister src, int imm8);
void vblendps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
127 changes: 127 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@@ -7112,6 +7112,133 @@ void C2_MacroAssembler::vector_saturating_op(int ideal_opc, BasicType elem_bt, X
}
}

void C2_MacroAssembler::vector_slice_32B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister xtmp, int origin, int vlen_enc) {
assert(vlen_enc == Assembler::AVX_256bit, "");
if (origin < 16) {
// The ALIGNR instruction concatenates the corresponding 128-bit
// lanes of the two source vectors and then performs a right
// shift over the intermediate value. Thus, the source vector
// lanes need to be shuffled into a format consumable by ALIGNR.
// i.e.
// Initial source vectors
// 0...256 0...256
// src1 = [v1 v2] and src2 = [v3 v4]
// Formatted source vectors when SHIFT < 16 bytes
// 0...256 0...256
// src1 = [v1 v2] and src2 = [v2 v3]
// The upper 128-bit lane of src2 does not impact the result, which is
// sliced from the lower and upper 128-bit lanes of src1 and the lower
// 128-bit lane of src2.
// i.e.
// Result lanes
// res[127:0]   = {src1[255:128] , src1[127:0]}   >> SHIFT
// res[255:128] = {src2[127:0]   , src1[255:128]} >> SHIFT
vperm2i128(xtmp, src1, src2, 0x21);
vpalignr(dst, xtmp, src1, origin, Assembler::AVX_256bit);
} else {
assert(origin > 16 && origin <= 32, "");
// Similarly, when SHIFT >= 16 bytes, the lower 128-bit lane of
// src1 does not impact the result, which is sliced from the
// upper 128-bit lane of src1 and the lower and upper 128-bit
// lanes of src2.
// Thus, the two source vectors should have the following format
// 0...256 0...256
// src1 = [v2 v3] and src2 = [v3 v4]
// Result lanes
// res[127:0]   = {src2[127:0]   , src1[255:128]} >> (SHIFT - 16)
// res[255:128] = {src2[255:128] , src2[127:0]}   >> (SHIFT - 16)
vperm2i128(xtmp, src1, src2, 0x21);
vpalignr(dst, src2, xtmp, origin - 16, Assembler::AVX_256bit);
}
}
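
For reference, the two-instruction sequences above implement a plain byte-window slice: the result is the 32-byte window starting at origin within the 64-byte concatenation {src2 : src1}, with src1 forming the low half. A minimal scalar sketch of that semantics (illustrative only, not part of the patch; the helper name is made up):

#include <cstdint>
#include <cstring>

// Scalar reference model of a 256-bit VectorSlice:
// dst = the 32-byte window of {src2 : src1} starting at byte 'origin',
// where src1 supplies the low 32 bytes of the concatenation.
static void slice_32B_ref(uint8_t dst[32], const uint8_t src1[32],
                          const uint8_t src2[32], int origin) {
  uint8_t concat[64];
  std::memcpy(concat, src1, 32);        // low half
  std::memcpy(concat + 32, src2, 32);   // high half
  std::memcpy(dst, concat + origin, 32);
}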


void C2_MacroAssembler::vector_slice_64B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister xtmp, int origin, int vlen_enc) {
if (origin < 16) {
// Initial source vectors
// 0.........512 0.........512
// src1 = [v1 v2 v3 v4] and src2 = [v5 v6 v7 v8]
// where v* represents a 128-bit wide vector lane.
// When SHIFT < 16 the result is sliced out of src1 and the
// lowest 128-bit lane of src2.
// ALIGNR considers the following pairs of 128-bit lanes, spread
// across the two source vectors, to compute the 128-bit lanes of
// the result vector.
// res[127:0]   = {src1[255:128], src1[127:0]}
// res[255:128] = {src1[383:256], src1[255:128]}
// res[383:256] = {src1[511:384], src1[383:256]}
// res[511:384] = {src2[127:0],   src1[511:384]}
//
// ALIGNR concatenates the corresponding lanes of the two source
// vectors before right shifting the intermediate result. Therefore,
// the source vector lanes should be shuffled into the following format
// src1 = {v1, v2, v3, v4} and src2 = {v2, v3, v4, v5}
//
// |-------------|
// |-----|--------| |
// alignr -> [v1 v2 v3 v4] [v2 v3 v4 v5]
// |_____|________| |
// |_____________|
evalignd(xtmp, src2, src1, 4, vlen_enc);
vpalignr(dst, xtmp, src1, origin, vlen_enc);
} else if (origin > 16 && origin < 32) {
// Similarly, for SHIFT between 16 and 32 bytes the
// result is sliced out of src1 and the lower
// two 128-bit lanes of src2.
// i.e.
// res[127:0]   = {src1[383:256], src1[255:128]}
// res[255:128] = {src1[511:384], src1[383:256]}
// res[383:256] = {src2[127:0],   src1[511:384]}
// res[511:384] = {src2[255:128], src2[127:0]}
// Thus, the source vector lanes should have the following format.
// src1 = {v2, v3, v4, v5} and src2 = {v3, v4, v5, v6}
evalignd(xtmp, src2, src1, 4, vlen_enc);
evalignd(dst, src2, src1, 8, vlen_enc);
vpalignr(dst, dst, xtmp, origin - 16, vlen_enc);
} else if (origin > 32 && origin < 48) {
// For SHIFT between 32 and 48 bytes the
// result is sliced out of src1 and the lower
// three 128-bit lanes of src2.
// i.e.
// res[127:0]   = {src1[511:384], src1[383:256]}
// res[255:128] = {src2[127:0],   src1[511:384]}
// res[383:256] = {src2[255:128], src2[127:0]}
// res[511:384] = {src2[383:256], src2[255:128]}
// Thus, the source vector lanes should have the following format.
// src1 = {v3, v4, v5, v6} and src2 = {v4, v5, v6, v7}
evalignd(xtmp, src2, src1, 8, vlen_enc);
evalignd(dst, src2, src1, 12, vlen_enc);
vpalignr(dst, dst, xtmp, origin - 32, vlen_enc);
} else {
// Finally, for SHIFT greater than 48 bytes the
// result is sliced out of the upper 128-bit lane of src1 and
// all of src2.
// i.e.
// res[127:0]   = {src2[127:0],   src1[511:384]}
// res[255:128] = {src2[255:128], src2[127:0]}
// res[383:256] = {src2[383:256], src2[255:128]}
// res[511:384] = {src2[511:384], src2[383:256]}
// Thus, the source vector lanes should have the following format.
// src1 = {v4, v5, v6, v7} and src2 = {v5, v6, v7, v8}
assert(origin > 48 && origin < 64, "");
evalignd(xtmp, src2, src1, 12, vlen_enc);
vpalignr(dst, src2, xtmp, origin - 48, vlen_enc);
}
}
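
As a sanity check on the lane shuffling above, the 16 < origin < 32 branch can be replayed on plain byte arrays: a dword-granular VALIGND pass followed by a per-lane VPALIGNR by the residual byte count reproduces the straight byte-window slice. A standalone sketch with scalar stand-ins for the two instructions (illustrative only, not part of the patch; helper names are made up):

#include <cassert>
#include <cstdint>
#include <cstring>

// Scalar model of VALIGND zmm: dst = ({hi : lo} >> imm32 * 4 bytes)[511:0]
static void valignd_ref(uint8_t dst[64], const uint8_t hi[64],
                        const uint8_t lo[64], int imm32) {
  uint8_t cat[128];
  std::memcpy(cat, lo, 64);
  std::memcpy(cat + 64, hi, 64);
  std::memcpy(dst, cat + 4 * imm32, 64);
}

// Scalar model of VPALIGNR zmm: per 128-bit lane,
// dst_lane = ({hi_lane : lo_lane} >> imm8 bytes)[127:0]
static void vpalignr_ref(uint8_t dst[64], const uint8_t hi[64],
                         const uint8_t lo[64], int imm8) {
  for (int lane = 0; lane < 4; lane++) {
    uint8_t cat[32];
    std::memcpy(cat, lo + 16 * lane, 16);
    std::memcpy(cat + 16, hi + 16 * lane, 16);
    std::memcpy(dst + 16 * lane, cat + imm8, 16);
  }
}

int main() {
  uint8_t src1[64], src2[64];
  for (int i = 0; i < 64; i++) { src1[i] = (uint8_t)i; src2[i] = (uint8_t)(64 + i); }

  for (int origin = 17; origin < 32; origin++) {
    if ((origin & 3) == 0) continue;   // dword-aligned origins take the single-VALIGND path
    // Replay the 16 < origin < 32 branch of vector_slice_64B_op.
    uint8_t xtmp[64], dst[64];
    valignd_ref(xtmp, src2, src1, 4);  // xtmp = {v2, v3, v4, v5}
    valignd_ref(dst,  src2, src1, 8);  // dst  = {v3, v4, v5, v6}
    vpalignr_ref(dst, dst, xtmp, origin - 16);

    // Reference: the 64-byte window of {src2 : src1} starting at 'origin'.
    uint8_t cat[128], ref[64];
    std::memcpy(cat, src1, 64);
    std::memcpy(cat + 64, src2, 64);
    std::memcpy(ref, cat + origin, 64);
    assert(std::memcmp(dst, ref, 64) == 0);
  }
  return 0;
}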

void C2_MacroAssembler::vector_slice_op(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister xtmp, int origin, int vlen_enc) {
if (VM_Version::supports_avx512vlbw()) {
vector_slice_64B_op(dst, src1, src2, xtmp, origin, vlen_enc);
} else {
assert(vlen_enc == Assembler::AVX_256bit, "");
vector_slice_32B_op(dst, src1, src2, xtmp, origin, vlen_enc);
}
}

void C2_MacroAssembler::evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc) {
switch(opcode) {
case Op_AddVHF: evaddph(dst, src1, src2, vlen_enc); break;
6 changes: 6 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
@@ -570,6 +570,12 @@

void select_from_two_vectors_evex(BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc);

void vector_slice_32B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp, int origin, int vlen_enc);

void vector_slice_64B_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp, int origin, int vlen_enc);

void vector_slice_op(XMMRegister dst, XMMRegister src1, XMMRegister src2, XMMRegister xtmp, int origin, int vlen_enc);

void evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2, int vlen_enc);

void evfp16ph(int opcode, XMMRegister dst, XMMRegister src1, Address src2, int vlen_enc);
72 changes: 72 additions & 0 deletions src/hotspot/cpu/x86/x86.ad
@@ -1723,6 +1723,14 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
return false; // Implementation limitation
}
break;
case Op_VectorSlice:
if (UseAVX < 1 || size_in_bits < 128) {
return false;
}
if (size_in_bits == 512 && !VM_Version::supports_avx512bw()) {
return false;
}
break;
case Op_VectorLoadShuffle:
case Op_VectorRearrange:
if(vlen == 2) {
@@ -10759,6 +10767,70 @@ instruct scalar_fma_HF_reg(regF dst, regF src1, regF src2)
ins_pipe( pipe_slow );
%}

[Review comment (Contributor) on the instruct below: suggested renaming vector_slice_const_origin_LT16B_reg to vector_slice_const_origin_EQ16B_reg or vector_slice_const_origin_16B_reg.]
instruct vector_slice_const_origin_LT16B_reg(vec dst, vec src1, vec src2, immI origin)
%{
predicate(Matcher::vector_length_in_bytes(n) == 16);
match(Set dst (VectorSlice (Binary src1 src2) origin));
format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2 \t" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
__ vpalignr($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, $origin$$constant, vlen_enc);
%}
ins_pipe(pipe_slow);
%}

instruct vector_slice_const_origin_GT16B_index16B_reg(vec dst, vec src1, vec src2, immI origin)
%{
predicate(Matcher::vector_length_in_bytes(n) > 16 && !VM_Version::supports_avx512vlbw() && n->in(2)->get_int() == 16);
match(Set dst (VectorSlice (Binary src1 src2) origin));
format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
// src1 = [v2, v1], src2 = [v4, v3]
// dst = [v3, v2]
__ vperm2i128($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, 0x21);
%}
ins_pipe(pipe_slow);
%}

instruct vector_slice_const_origin_GT16B_reg(vec dst, vec src1, vec src2, immI origin, vec xtmp)
%{
predicate(Matcher::vector_length_in_bytes(n) > 16 && !VM_Version::supports_avx512vlbw() && n->in(2)->get_int() != 16);
match(Set dst (VectorSlice (Binary src1 src2) origin));
effect(TEMP xtmp);
format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2 \t!using $xtmp as TEMP" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
__ vector_slice_op($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, $origin$$constant, vlen_enc);
%}
ins_pipe(pipe_slow);
%}

instruct vector_slice_const_origin_GT16B_index_multiple4_reg_evex(vec dst, vec src1, vec src2, immI origin)
%{
predicate(Matcher::vector_length_in_bytes(n) > 16 && VM_Version::supports_avx512vlbw() && (n->in(2)->get_int() & 0x3) == 0);
match(Set dst (VectorSlice (Binary src1 src2) origin));
format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
int normalized_origin = $origin$$constant >> 2;
__ evalignd($dst$$XMMRegister, $src2$$XMMRegister, $src1$$XMMRegister, normalized_origin, vlen_enc);
%}
ins_pipe(pipe_slow);
%}
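
The normalization relies on VALIGND shifting the {src2 : src1} concatenation at 4-byte granularity, so an immediate of origin >> 2 dwords moves it by exactly origin bytes. A byte-array sketch of that equivalence (illustrative only, not part of the patch; the helper name is made up):

#include <cassert>
#include <cstdint>
#include <cstring>

// For a dword-aligned byte origin, a single VALIGND by (origin >> 2)
// equals the byte-window slice of {src2 : src1}.
static void slice_valignd_ref(uint8_t* dst, const uint8_t* src1, const uint8_t* src2,
                              int vlen_bytes, int origin) {
  assert((origin & 0x3) == 0 && origin <= vlen_bytes);
  int imm = origin >> 2;                                     // dword immediate
  std::memcpy(dst, src1 + 4 * imm, vlen_bytes - origin);     // tail of src1
  std::memcpy(dst + vlen_bytes - origin, src2, origin);      // head of src2
}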

instruct vector_slice_const_origin_GT16B_reg_evex(vec dst, vec src1, vec src2, immI origin, vec xtmp)
%{
predicate(Matcher::vector_length_in_bytes(n) > 16 && VM_Version::supports_avx512vlbw() && (n->in(2)->get_int() & 0x3) != 0);
match(Set dst (VectorSlice (Binary src1 src2) origin));
effect(TEMP dst, TEMP xtmp);
format %{ "vector_slice_const_origin $dst, $origin, $src1, $src2 \t!using $xtmp as TEMP" %}
ins_encode %{
int vlen_enc = vector_length_encoding(this);
__ vector_slice_op($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $xtmp$$XMMRegister, $origin$$constant, vlen_enc);
%}
ins_pipe(pipe_slow);
%}

instruct vector_sqrt_HF_reg(vec dst, vec src)
%{
2 changes: 1 addition & 1 deletion src/hotspot/share/adlc/formssel.cpp
@@ -4363,7 +4363,7 @@ bool MatchRule::is_vector() const {
"VectorRearrange", "VectorLoadShuffle", "VectorLoadConst",
"VectorCastB2X", "VectorCastS2X", "VectorCastI2X",
"VectorCastL2X", "VectorCastF2X", "VectorCastD2X", "VectorCastF2HF", "VectorCastHF2F",
"VectorUCastB2X", "VectorUCastS2X", "VectorUCastI2X",
"VectorUCastB2X", "VectorUCastS2X", "VectorUCastI2X", "VectorSlice",
"VectorMaskWrapper","VectorMaskCmp","VectorReinterpret","LoadVectorMasked","StoreVectorMasked",
"FmaVD", "FmaVF", "FmaVHF", "PopCountVI", "PopCountVL", "PopulateIndex", "VectorLongToMask",
"CountLeadingZerosV", "CountTrailingZerosV", "SignumVF", "SignumVD", "SaturatingAddV", "SaturatingSubV",
12 changes: 12 additions & 0 deletions src/hotspot/share/classfile/vmIntrinsics.hpp
@@ -1167,6 +1167,18 @@ class methodHandle;
"Z") \
do_name(vector_test_name, "test") \
\
do_intrinsic(_VectorSlice, jdk_internal_vm_vector_VectorSupport, vector_slice_name, vector_slice_sig, F_S) \
do_signature(vector_slice_sig, "(I" \
"Ljava/lang/Class;" \
"Ljava/lang/Class;" \
"I" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;" \
"Ljdk/internal/vm/vector/VectorSupport$VectorSliceOp;)" \
"Ljdk/internal/vm/vector/VectorSupport$Vector;") \
[Review comment (Contributor) on the line above: "Seems this \ is not aligned ?"]
do_name(vector_slice_name, "sliceOp") \
[Review comment (Contributor) on the line above: ditto (trailing \ alignment).]
\
\
do_intrinsic(_VectorBlend, jdk_internal_vm_vector_VectorSupport, vector_blend_name, vector_blend_sig, F_S) \
do_signature(vector_blend_sig, "(Ljava/lang/Class;" \
"Ljava/lang/Class;" \
1 change: 1 addition & 0 deletions src/hotspot/share/opto/c2compiler.cpp
@@ -841,6 +841,7 @@ bool C2Compiler::is_intrinsic_supported(vmIntrinsics::ID id) {
case vmIntrinsics::_VectorSelectFromTwoVectorOp:
case vmIntrinsics::_VectorGatherOp:
case vmIntrinsics::_VectorScatterOp:
case vmIntrinsics::_VectorSlice:
case vmIntrinsics::_VectorReductionCoerced:
case vmIntrinsics::_VectorTest:
case vmIntrinsics::_VectorBlend:
33 changes: 33 additions & 0 deletions src/hotspot/share/opto/callGenerator.cpp
@@ -441,6 +441,31 @@ CallGenerator* CallGenerator::for_mh_late_inline(ciMethod* caller, ciMethod* cal
return cg;
}

class LateInlineVectorCallGenerator : public LateInlineCallGenerator {
protected:
CallGenerator* _inline_cg;

public:
LateInlineVectorCallGenerator(ciMethod* method, CallGenerator* intrinsic_cg, CallGenerator* inline_cg) :
LateInlineCallGenerator(method, intrinsic_cg) , _inline_cg(inline_cg) {}

CallGenerator* inline_cg2() const { return _inline_cg; }
bool inline_fallback();
virtual bool is_vector_late_inline() const { return true; }
};

bool LateInlineVectorCallGenerator::inline_fallback() {
switch (method()->intrinsic_id()) {
case vmIntrinsics::_VectorSlice: return true;
default : return false;
}
}

CallGenerator* CallGenerator::for_vector_late_inline(ciMethod* m, CallGenerator* intrinsic_cg, CallGenerator* inline_cg) {
return new LateInlineVectorCallGenerator(m, intrinsic_cg, inline_cg);
}


// Allow inlining decisions to be delayed
class LateInlineVirtualCallGenerator : public VirtualCallGenerator {
private:
@@ -673,6 +698,14 @@ void CallGenerator::do_late_inline_helper() {

// Now perform the inlining using the synthesized JVMState
JVMState* new_jvms = inline_cg()->generate(jvms);
// Attempt to inline the fallback implementation in case of
// intrinsification failure.
if (new_jvms == nullptr && is_vector_late_inline()) {
LateInlineVectorCallGenerator* late_inline_vec_cg = static_cast<LateInlineVectorCallGenerator*>(this);
if (late_inline_vec_cg->inline_fallback()) {
new_jvms = late_inline_vec_cg->inline_cg2()->generate(jvms);
}
}
if (new_jvms == nullptr) return; // no change
if (C->failing()) return;

4 changes: 3 additions & 1 deletion src/hotspot/share/opto/callGenerator.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2000, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -75,6 +75,7 @@ class CallGenerator : public ArenaObj {
// same but for method handle calls
virtual bool is_mh_late_inline() const { return false; }
virtual bool is_string_late_inline() const { return false; }
virtual bool is_vector_late_inline() const { return false; }
virtual bool is_boxing_late_inline() const { return false; }
virtual bool is_vector_reboxing_late_inline() const { return false; }
virtual bool is_virtual_late_inline() const { return false; }
@@ -141,6 +142,7 @@ class CallGenerator : public ArenaObj {
static CallGenerator* for_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_mh_late_inline(ciMethod* caller, ciMethod* callee, bool input_not_const);
static CallGenerator* for_string_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_vector_late_inline(ciMethod* m, CallGenerator* intrinsic_cg, CallGenerator* inline_cg);
static CallGenerator* for_boxing_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_vector_reboxing_late_inline(ciMethod* m, CallGenerator* inline_cg);
static CallGenerator* for_late_inline_virtual(ciMethod* m, int vtable_index, float expected_uses);
1 change: 1 addition & 0 deletions src/hotspot/share/opto/classes.hpp
@@ -521,6 +521,7 @@ macro(VectorRearrange)
macro(VectorLoadMask)
macro(VectorLoadShuffle)
macro(VectorLoadConst)
macro(VectorSlice)
macro(VectorStoreMask)
macro(VectorReinterpret)
macro(VectorCast)