diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index 76e3c92ddc261..3cd88eec8506e 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2014, 2024, Red Hat, Inc. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // @@ -2385,6 +2385,20 @@ uint Matcher::vector_ideal_reg(int len) { return 0; } +// Vector ideal reg size corresponding to the specified len in bytes +uint Matcher::vector_ideal_reg_size(int len) { + assert(MaxVectorSize >= len, ""); + uint ideal_reg = vector_ideal_reg(len); + switch (ideal_reg) { + case Op_VecD: return 8; + case Op_VecX: return 16; + case Op_VecA: return MaxVectorSize; + default: + ShouldNotReachHere(); + return 0; + } +} + MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) { assert(Matcher::is_generic_vector(generic_opnd), "not generic"); switch (ideal_reg) { @@ -2631,12 +2645,13 @@ bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) { // into registers? bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) { - // Loads and stores with indirect memory input (e.g., volatile loads and - // stores) do not subsume the input into complex addressing expressions. If - // the addressing expression is input to at least one such load or store, do - // not clone the addressing expression. Query needs_acquiring_load and - // needs_releasing_store as a proxy for indirect memory input, as it is not - // possible to directly query for indirect memory input at this stage. + // Loads and stores with indirect memory input (e.g., volatile loads/stores, + // and vector gather_loads/scatter_stores) do not subsume the input into + // complex addressing expressions. If the addressing expression is input + // to at least one such load or store, do not clone the addressing expression. + // Query needs_acquiring_load and needs_releasing_store as a proxy for + // indirect memory input, as it is not possible to directly query for indirect + // memory input at this stage. for (DUIterator_Fast imax, i = m->fast_outs(imax); i < imax; i++) { Node* n = m->fast_out(i); if (n->is_Load() && needs_acquiring_load(n)) { @@ -2645,6 +2660,13 @@ bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, if (n->is_Store() && needs_releasing_store(n)) { return false; } + + if (n->is_LoadVectorGather() || + n->is_StoreVectorScatter() || + n->is_LoadVectorGatherMasked() || + n->is_StoreVectorScatterMasked()) { + return false; + } } if (clone_base_plus_offset_address(m, mstack, address_visited)) { diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index c7a0fc5724b2c..2f7a399a7dc3d 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -168,22 +168,21 @@ source %{ case Op_MaskAll: case Op_VectorMaskGen: case Op_LoadVectorMasked: + case Op_LoadVectorGather: + case Op_LoadVectorGatherMasked: case Op_StoreVectorMasked: case Op_StoreVectorScatter: case Op_StoreVectorScatterMasked: case Op_PopulateIndex: case Op_CompressM: case Op_CompressV: + // Temporarily disable vector mask widen support for NEON, + // as we do not have the use case now. 
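+    // Only SVE lowerings are provided for VectorMaskWiden below
+    // (sve_punpklo/sve_punpkhi); a NEON lowering would need a
+    // different instruction sequence.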
+ case Op_VectorMaskWiden: if (UseSVE == 0) { return false; } break; - case Op_LoadVectorGather: - case Op_LoadVectorGatherMasked: - if (UseSVE == 0 || is_subword_type(bt)) { - return false; - } - break; case Op_MulAddVS2VI: if (length_in_bytes != 16) { return false; @@ -325,6 +324,11 @@ source %{ return false; } + // SVE always needs the vector index for gather/scatter. + bool Matcher::gather_scatter_needs_vector_index(BasicType elem_bt, int vlen) { + return true; + } + // Assert that the given node is not a variable shift. bool assert_not_var_shift(const Node* n) { assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift"); @@ -5075,6 +5079,35 @@ instruct extractD(vRegD dst, vReg src, immI idx) %{ ins_pipe(pipe_slow); %} +// ---------------------------- Vector Slice ------------------------ + +instruct vslice_neon(vReg dst, vReg src1, vReg src2, immI index) %{ + predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst (VectorSlice (Binary src1 src2) index)); + format %{ "vslice_neon $dst, $src1, $src2, $index" %} + ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + uint scale = type2aelembytes(Matcher::vector_element_basic_type(this)); + __ ext($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B, + $src1$$FloatRegister, $src2$$FloatRegister, + ((uint)$index$$constant * scale)); + %} + ins_pipe(pipe_slow); +%} + +instruct vslice_sve(vReg dst_src1, vReg src2, immI index) %{ + predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst_src1 (VectorSlice (Binary dst_src1 src2) index)); + format %{ "vslice_sve $dst_src1, $dst_src1, $src2, $index" %} + ins_encode %{ + assert(UseSVE > 0, "must be sve"); + uint scale = type2aelembytes(Matcher::vector_element_basic_type(this)); + __ sve_ext($dst_src1$$FloatRegister, $src2$$FloatRegister, + ((uint)$index$$constant * scale)); + %} + ins_pipe(pipe_slow); +%} + // ------------------------------ Vector mask load/store ----------------------- // vector load mask @@ -5738,6 +5771,32 @@ instruct vmaskcast_narrow_sve(pReg dst, pReg src, pReg ptmp) %{ ins_pipe(pipe_slow); %} +// Vector mask widen to twice size +// +// Unpack elements from the lowest or highest half of the source +// predicate and place in elements of twice their size within the +// destination predicate. 
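+// For example, widening a mask of 16 B-sized lanes [m0, m1, ..., m15]
+// to H-sized lanes yields:
+//   lo (sve_punpklo): [m0, m1, ..., m7]
+//   hi (sve_punpkhi): [m8, m9, ..., m15]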
+ +instruct vmaskwiden_lo_sve(pReg dst, pReg src) %{ + predicate(UseSVE > 0 && n->as_VectorMaskWiden()->is_lo()); + match(Set dst (VectorMaskWiden src)); + format %{ "vmaskwiden_lo_sve $dst, $src" %} + ins_encode %{ + __ sve_punpklo($dst$$PRegister, $src$$PRegister); + %} + ins_pipe(pipe_slow); +%} + +instruct vmaskwiden_hi_sve(pReg dst, pReg src) %{ + predicate(UseSVE > 0 && !n->as_VectorMaskWiden()->is_lo()); + match(Set dst (VectorMaskWiden src)); + format %{ "vmaskwiden_hi_sve $dst, $src" %} + ins_encode %{ + __ sve_punpkhi($dst$$PRegister, $src$$PRegister); + %} + ins_pipe(pipe_slow); +%} + // vector mask reinterpret instruct vmask_reinterpret_same_esize(pReg dst_src) %{ @@ -6471,6 +6530,55 @@ instruct rearrange(vReg dst, vReg src, vReg shuffle) %{ // ------------------------------ Vector Load Gather --------------------------- +instruct gather_load_subword_le128(vReg dst, indirect mem, vReg idx) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGather()->in(3)) <= 16); + match(Set dst (LoadVectorGather mem idx)); + effect(TEMP_DEF dst); + format %{ "gather_load_subword_le128 $dst, $mem, $idx\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + } + %} + ins_pipe(pipe_slow); +%} + +instruct gather_load_subword_gt128(vReg dst, indirect mem, vReg idx, vReg vtmp) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGather()->in(3)) > 16); + match(Set dst (LoadVectorGather mem idx)); + effect(TEMP_DEF dst, TEMP vtmp); + format %{ "gather_load_subword_gt128 $dst, $mem, $idx\t# vector (sve). 
KILL $vtmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_dup($vtmp$$FloatRegister, __ S, 0); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ B, $dst$$FloatRegister, $vtmp$$FloatRegister); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{ predicate(UseSVE > 0 && type2aelembytes(Matcher::vector_element_basic_type(n)) == 4); @@ -6481,7 +6589,7 @@ instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{ assert(length_in_bytes == MaxVectorSize, "invalid vector length"); __ sve_ld1w_gather($dst$$FloatRegister, ptrue, as_Register($mem$$base), $idx$$FloatRegister); - %} + %} ins_pipe(pipe_slow); %} @@ -6501,6 +6609,55 @@ instruct gather_loadD(vReg dst, indirect mem, vReg idx, vReg tmp) %{ ins_pipe(pipe_slow); %} +instruct gather_load_subword_masked_le128(vReg dst, indirect mem, vReg idx, pRegGov pg) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGatherMasked()->in(3)->in(1)) <= 16); + match(Set dst (LoadVectorGatherMasked mem (Binary idx pg))); + effect(TEMP_DEF dst); + format %{ "gather_load_subword_masked_le128 $dst, $pg, $mem, $idx\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + } + %} + ins_pipe(pipe_slow); +%} + +instruct gather_load_subword_masked_gt128(vReg dst, indirect mem, vReg idx, vReg vtmp, pRegGov pg) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGatherMasked()->in(3)->in(1)) > 16); + match(Set dst (LoadVectorGatherMasked mem (Binary idx pg))); + effect(TEMP_DEF dst, TEMP vtmp); + format %{ "gather_load_subword_masked_gt128 $dst, $pg, $mem, $idx\t# vector (sve). 
KILL $vtmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_dup($vtmp$$FloatRegister, __ S, 0); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ B, $dst$$FloatRegister, $vtmp$$FloatRegister); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + instruct gather_loadS_masked(vReg dst, indirect mem, vReg idx, pRegGov pg) %{ predicate(UseSVE > 0 && type2aelembytes(Matcher::vector_element_basic_type(n)) == 4); diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 171bc39054549..3b2317501ce14 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -158,22 +158,21 @@ source %{ case Op_MaskAll: case Op_VectorMaskGen: case Op_LoadVectorMasked: + case Op_LoadVectorGather: + case Op_LoadVectorGatherMasked: case Op_StoreVectorMasked: case Op_StoreVectorScatter: case Op_StoreVectorScatterMasked: case Op_PopulateIndex: case Op_CompressM: case Op_CompressV: + // Temporarily disable vector mask widen support for NEON, + // as we do not have the use case now. + case Op_VectorMaskWiden: if (UseSVE == 0) { return false; } break; - case Op_LoadVectorGather: - case Op_LoadVectorGatherMasked: - if (UseSVE == 0 || is_subword_type(bt)) { - return false; - } - break; case Op_MulAddVS2VI: if (length_in_bytes != 16) { return false; @@ -315,6 +314,11 @@ source %{ return false; } + // SVE always needs the vector index for gather/scatter. + bool Matcher::gather_scatter_needs_vector_index(BasicType elem_bt, int vlen) { + return true; + } + // Assert that the given node is not a variable shift. bool assert_not_var_shift(const Node* n) { assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift"); @@ -3313,6 +3317,35 @@ EXTRACT_FP(F, fmovs, 4, S, 2) // DOUBLE EXTRACT_FP(D, fmovd, 2, D, 3) +// ---------------------------- Vector Slice ------------------------ + +instruct vslice_neon(vReg dst, vReg src1, vReg src2, immI index) %{ + predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst (VectorSlice (Binary src1 src2) index)); + format %{ "vslice_neon $dst, $src1, $src2, $index" %} + ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + uint scale = type2aelembytes(Matcher::vector_element_basic_type(this)); + __ ext($dst$$FloatRegister, length_in_bytes == 16 ? 
__ T16B : __ T8B, + $src1$$FloatRegister, $src2$$FloatRegister, + ((uint)$index$$constant * scale)); + %} + ins_pipe(pipe_slow); +%} + +instruct vslice_sve(vReg dst_src1, vReg src2, immI index) %{ + predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst_src1 (VectorSlice (Binary dst_src1 src2) index)); + format %{ "vslice_sve $dst_src1, $dst_src1, $src2, $index" %} + ins_encode %{ + assert(UseSVE > 0, "must be sve"); + uint scale = type2aelembytes(Matcher::vector_element_basic_type(this)); + __ sve_ext($dst_src1$$FloatRegister, $src2$$FloatRegister, + ((uint)$index$$constant * scale)); + %} + ins_pipe(pipe_slow); +%} + // ------------------------------ Vector mask load/store ----------------------- // vector load mask @@ -3885,6 +3918,32 @@ instruct vmaskcast_narrow_sve(pReg dst, pReg src, pReg ptmp) %{ ins_pipe(pipe_slow); %} +// Vector mask widen to twice size +// +// Unpack elements from the lowest or highest half of the source +// predicate and place in elements of twice their size within the +// destination predicate. + +instruct vmaskwiden_lo_sve(pReg dst, pReg src) %{ + predicate(UseSVE > 0 && n->as_VectorMaskWiden()->is_lo()); + match(Set dst (VectorMaskWiden src)); + format %{ "vmaskwiden_lo_sve $dst, $src" %} + ins_encode %{ + __ sve_punpklo($dst$$PRegister, $src$$PRegister); + %} + ins_pipe(pipe_slow); +%} + +instruct vmaskwiden_hi_sve(pReg dst, pReg src) %{ + predicate(UseSVE > 0 && !n->as_VectorMaskWiden()->is_lo()); + match(Set dst (VectorMaskWiden src)); + format %{ "vmaskwiden_hi_sve $dst, $src" %} + ins_encode %{ + __ sve_punpkhi($dst$$PRegister, $src$$PRegister); + %} + ins_pipe(pipe_slow); +%} + // vector mask reinterpret instruct vmask_reinterpret_same_esize(pReg dst_src) %{ @@ -4574,6 +4633,55 @@ instruct rearrange(vReg dst, vReg src, vReg shuffle) %{ // ------------------------------ Vector Load Gather --------------------------- +instruct gather_load_subword_le128(vReg dst, indirect mem, vReg idx) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGather()->in(3)) <= 16); + match(Set dst (LoadVectorGather mem idx)); + effect(TEMP_DEF dst); + format %{ "gather_load_subword_le128 $dst, $mem, $idx\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + } + %} + ins_pipe(pipe_slow); +%} + +instruct gather_load_subword_gt128(vReg dst, indirect mem, vReg idx, vReg vtmp) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGather()->in(3)) > 16); + match(Set dst (LoadVectorGather mem idx)); + effect(TEMP_DEF dst, TEMP vtmp); + format %{ "gather_load_subword_gt128 $dst, $mem, $idx\t# vector (sve). 
KILL $vtmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_dup($vtmp$$FloatRegister, __ S, 0); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ B, $dst$$FloatRegister, $vtmp$$FloatRegister); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{ predicate(UseSVE > 0 && type2aelembytes(Matcher::vector_element_basic_type(n)) == 4); @@ -4584,7 +4692,7 @@ instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{ assert(length_in_bytes == MaxVectorSize, "invalid vector length"); __ sve_ld1w_gather($dst$$FloatRegister, ptrue, as_Register($mem$$base), $idx$$FloatRegister); - %} + %} ins_pipe(pipe_slow); %} @@ -4604,6 +4712,55 @@ instruct gather_loadD(vReg dst, indirect mem, vReg idx, vReg tmp) %{ ins_pipe(pipe_slow); %} +instruct gather_load_subword_masked_le128(vReg dst, indirect mem, vReg idx, pRegGov pg) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGatherMasked()->in(3)->in(1)) <= 16); + match(Set dst (LoadVectorGatherMasked mem (Binary idx pg))); + effect(TEMP_DEF dst); + format %{ "gather_load_subword_masked_le128 $dst, $pg, $mem, $idx\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + } + %} + ins_pipe(pipe_slow); +%} + +instruct gather_load_subword_masked_gt128(vReg dst, indirect mem, vReg idx, vReg vtmp, pRegGov pg) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGatherMasked()->in(3)->in(1)) > 16); + match(Set dst (LoadVectorGatherMasked mem (Binary idx pg))); + effect(TEMP_DEF dst, TEMP vtmp); + format %{ "gather_load_subword_masked_gt128 $dst, $pg, $mem, $idx\t# vector (sve). 
KILL $vtmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_dup($vtmp$$FloatRegister, __ S, 0); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ B, $dst$$FloatRegister, $vtmp$$FloatRegister); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + instruct gather_loadS_masked(vReg dst, indirect mem, vReg idx, pRegGov pg) %{ predicate(UseSVE > 0 && type2aelembytes(Matcher::vector_element_basic_type(n)) == 4); diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp index 3db7d30884429..8625481678c1e 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -3616,6 +3616,10 @@ template f(op1, 31, 25), f(type, 24, 23), f(op2, 22, 21), rf(Zm, 16); \ f(op3, 15, 13), pgrf(Pg, 10), srf(Xn, 5), rf(Zt, 0); \ } + // SVE 8-bit gather load bytes (scalar plus 32-bit unscaled offsets) + INSN(sve_ld1b_gather, 0b1000010, 0b00, 0b00, 0b010); + // SVE 16-bit gather load halfwords (scalar plus 32-bit scaled offsets) + INSN(sve_ld1h_gather, 0b1000010, 0b01, 0b01, 0b010); // SVE 32-bit gather load words (scalar plus 32-bit scaled offsets) INSN(sve_ld1w_gather, 0b1000010, 0b10, 0b01, 0b010); // SVE 64-bit gather load (scalar plus 32-bit unpacked scaled offsets) diff --git a/src/hotspot/cpu/arm/arm.ad b/src/hotspot/cpu/arm/arm.ad index f3b97d23ad306..5c1bcb2e53fb7 100644 --- a/src/hotspot/cpu/arm/arm.ad +++ b/src/hotspot/cpu/arm/arm.ad @@ -1003,6 +1003,10 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen return false; } +bool Matcher::gather_scatter_needs_vector_index(BasicType elem_bt, int vlen) { + return false; +} + const RegMask* Matcher::predicate_reg_mask(void) { return nullptr; } @@ -1037,6 +1041,12 @@ uint Matcher::vector_ideal_reg(int size) { return 0; } +// Vector ideal reg size corresponding to the specified size in bytes +uint Matcher::vector_ideal_reg_size(int size) { + assert(MaxVectorSize >= size, ""); + return size; +} + // Limits on vector size (number of elements) loaded into vector. int Matcher::max_vector_size(const BasicType bt) { assert(is_java_primitive(bt), "only primitive type vectors"); diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad index 07d681e89823e..b71250ebdf80f 100644 --- a/src/hotspot/cpu/ppc/ppc.ad +++ b/src/hotspot/cpu/ppc/ppc.ad @@ -2162,6 +2162,10 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen return false; } +bool Matcher::gather_scatter_needs_vector_index(BasicType elem_bt, int vlen) { + return false; +} + const RegMask* Matcher::predicate_reg_mask(void) { return nullptr; } @@ -2198,6 +2202,12 @@ uint Matcher::vector_ideal_reg(int size) { } } +// Vector ideal reg size corresponding to the specified size in bytes +uint Matcher::vector_ideal_reg_size(int size) { + assert(MaxVectorSize == size, ""); + return size; +} + // Limits on vector size (number of elements) loaded into vector. 
int Matcher::max_vector_size(const BasicType bt) { assert(is_java_primitive(bt), "only primitive type vectors"); diff --git a/src/hotspot/cpu/riscv/riscv.ad b/src/hotspot/cpu/riscv/riscv.ad index aca2f4dd488ae..f6561fbd88970 100644 --- a/src/hotspot/cpu/riscv/riscv.ad +++ b/src/hotspot/cpu/riscv/riscv.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved. // Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -2029,6 +2029,12 @@ uint Matcher::vector_ideal_reg(int len) { return 0; } +// Vector ideal reg size corresponding to the specified len in bytes +uint Matcher::vector_ideal_reg_size(int len) { + assert(MaxVectorSize >= len, ""); + return MaxVectorSize; +} + int Matcher::scalable_vector_reg_size(const BasicType bt) { return Matcher::max_vector_size(bt); } diff --git a/src/hotspot/cpu/riscv/riscv_v.ad b/src/hotspot/cpu/riscv/riscv_v.ad index 1e99b2ece2914..c51aad8e8fecb 100644 --- a/src/hotspot/cpu/riscv/riscv_v.ad +++ b/src/hotspot/cpu/riscv/riscv_v.ad @@ -138,6 +138,10 @@ source %{ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) { return false; } + + bool Matcher::gather_scatter_needs_vector_index(BasicType elem_bt, int vlen) { + return !is_subword_type(elem_bt); + } %} definitions %{ diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad index c32064be86d87..5771d0a77c5cb 100644 --- a/src/hotspot/cpu/s390/s390.ad +++ b/src/hotspot/cpu/s390/s390.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2017, 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2017, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2017, 2024 SAP SE. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // @@ -1809,6 +1809,10 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen return false; } +bool Matcher::gather_scatter_needs_vector_index(BasicType elem_bt, int vlen) { + return false; +} + const RegMask* Matcher::predicate_reg_mask(void) { return nullptr; } @@ -1847,6 +1851,12 @@ uint Matcher::vector_ideal_reg(int size) { } } +// Vector ideal reg size corresponding to the specified size in bytes +uint Matcher::vector_ideal_reg_size(int size) { + assert(MaxVectorSize == size, ""); + return size; +} + // Limits on vector size (number of elements) loaded into vector. 
int Matcher::max_vector_size(const BasicType bt) { assert(is_java_primitive(bt), "only primitive type vectors"); diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp index a7967d83a4e7f..fb4eab9c63755 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp @@ -1478,23 +1478,18 @@ void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, } } -void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, - XMMRegister dst, Register base, - Register idx_base, - Register offset, Register mask, - Register mask_idx, Register rtmp, - int vlen_enc) { +void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst, + Register base, Register idx_base, + Register mask, Register mask_idx, + Register rtmp, int vlen_enc) { vpxor(dst, dst, dst, vlen_enc); if (elem_bt == T_SHORT) { for (int i = 0; i < 4; i++) { - // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 + // dst[i] = mask[i] ? src[idx_base[i]] : 0 Label skip_load; btq(mask, mask_idx); jccb(Assembler::carryClear, skip_load); movl(rtmp, Address(idx_base, i * 4)); - if (offset != noreg) { - addl(rtmp, offset); - } pinsrw(dst, Address(base, rtmp, Address::times_2), i); bind(skip_load); incq(mask_idx); @@ -1502,14 +1497,11 @@ void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, } else { assert(elem_bt == T_BYTE, ""); for (int i = 0; i < 8; i++) { - // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 + // dst[i] = mask[i] ? src[idx_base[i]] : 0 Label skip_load; btq(mask, mask_idx); jccb(Assembler::carryClear, skip_load); movl(rtmp, Address(idx_base, i * 4)); - if (offset != noreg) { - addl(rtmp, offset); - } pinsrb(dst, Address(base, rtmp), i); bind(skip_load); incq(mask_idx); @@ -1517,28 +1509,21 @@ void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, } } -void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, - Register base, Register idx_base, - Register offset, Register rtmp, - int vlen_enc) { +void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst, + Register base, Register idx_base, + Register rtmp, int vlen_enc) { vpxor(dst, dst, dst, vlen_enc); if (elem_bt == T_SHORT) { for (int i = 0; i < 4; i++) { - // dst[i] = src[offset + idx_base[i]] + // dst[i] = src[idx_base[i]] movl(rtmp, Address(idx_base, i * 4)); - if (offset != noreg) { - addl(rtmp, offset); - } pinsrw(dst, Address(base, rtmp, Address::times_2), i); } } else { assert(elem_bt == T_BYTE, ""); for (int i = 0; i < 8; i++) { - // dst[i] = src[offset + idx_base[i]] + // dst[i] = src[idx_base[i]] movl(rtmp, Address(idx_base, i * 4)); - if (offset != noreg) { - addl(rtmp, offset); - } pinsrb(dst, Address(base, rtmp), i); } } @@ -1567,11 +1552,10 @@ void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, */ void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, Register base, Register idx_base, - Register offset, Register mask, - XMMRegister xtmp1, XMMRegister xtmp2, - XMMRegister temp_dst, Register rtmp, - Register mask_idx, Register length, - int vector_len, int vlen_enc) { + Register mask, XMMRegister xtmp1, + XMMRegister xtmp2, XMMRegister temp_dst, + Register rtmp, Register mask_idx, + Register length, int vector_len, int vlen_enc) { Label GATHER8_LOOP; assert(is_subword_type(elem_ty), ""); movl(length, vector_len); @@ -1585,9 +1569,9 @@ void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, bind(GATHER8_LOOP); // 
TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES if (mask == noreg) { - vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); + vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc); } else { - vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc); + vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc); } // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit); diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp index dd2880d88c381..21aa899766546 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp @@ -489,15 +489,14 @@ void efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2); - void vgather_subword(BasicType elem_ty, XMMRegister dst, Register base, Register idx_base, Register offset, - Register mask, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, + void vgather_subword(BasicType elem_ty, XMMRegister dst, Register base, Register idx_base, Register mask, + XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, Register midx, Register length, int vector_len, int vlen_enc); - void vgather8b_masked_offset(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base, - Register offset, Register mask, Register midx, Register rtmp, int vlen_enc); - - void vgather8b_offset(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base, - Register offset, Register rtmp, int vlen_enc); + void vgather8b_masked(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base, + Register mask, Register midx, Register rtmp, int vlen_enc); + void vgather8b(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base, + Register rtmp, int vlen_enc); void vector_saturating_op(int opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc); diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index a281331cb2986..ad109af47b2c8 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -2100,6 +2100,11 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen } } +// Return true if gather/scatter needs vector index as input. 
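+// Subword (byte/short) gathers on x86 are expanded as a scalar load loop
+// driven by the index array in memory (see C2_MacroAssembler::vgather_subword),
+// so no vector of indices is materialized for them.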
+bool Matcher::gather_scatter_needs_vector_index(BasicType elem_bt, int vlen) { + return !is_subword_type(elem_bt); +} + MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) { assert(Matcher::is_generic_vector(generic_opnd), "not generic"); bool legacy = (generic_opnd->opcode() == LEGVEC); @@ -2247,6 +2252,12 @@ uint Matcher::vector_ideal_reg(int size) { return 0; } +// Vector ideal reg size corresponding to the specified len in bytes +uint Matcher::vector_ideal_reg_size(int size) { + assert(MaxVectorSize >= size, ""); + return size; +} + // Check for shift by small constant as well static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) { if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() && @@ -4023,24 +4034,24 @@ instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRe ins_pipe( pipe_slow ); %} -instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{ +instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, rRegP tmp, rRegI rtmp) %{ predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); - match(Set dst (LoadVectorGather mem (Binary idx_base offset))); + match(Set dst (LoadVectorGather mem idx_base)); effect(TEMP tmp, TEMP rtmp); format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %} ins_encode %{ int vlen_enc = vector_length_encoding(this); BasicType elem_bt = Matcher::vector_element_basic_type(this); __ lea($tmp$$Register, $mem$$Address); - __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc); + __ vgather8b(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp$$Register, vlen_enc); %} ins_pipe( pipe_slow ); %} -instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp, +instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, rRegP tmp, rRegP idx_base_temp, vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{ predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); - match(Set dst (LoadVectorGather mem (Binary idx_base offset))); + match(Set dst (LoadVectorGather mem idx_base)); effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr); format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! 
using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %} ins_encode %{ @@ -4049,49 +4060,15 @@ instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, BasicType elem_bt = Matcher::vector_element_basic_type(this); __ lea($tmp$$Register, $mem$$Address); __ movptr($idx_base_temp$$Register, $idx_base$$Register); - __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister, + __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc); %} ins_pipe( pipe_slow ); %} -instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{ - predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); - match(Set dst (LoadVectorGather mem (Binary idx_base offset))); - effect(TEMP tmp, TEMP rtmp, KILL cr); - format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %} - ins_encode %{ - int vlen_enc = vector_length_encoding(this); - BasicType elem_bt = Matcher::vector_element_basic_type(this); - __ lea($tmp$$Register, $mem$$Address); - __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc); - %} - ins_pipe( pipe_slow ); -%} - - -instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp, - vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{ - predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); - match(Set dst (LoadVectorGather mem (Binary idx_base offset))); - effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr); - format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %} - ins_encode %{ - int vlen_enc = vector_length_encoding(this); - int vector_len = Matcher::vector_length(this); - BasicType elem_bt = Matcher::vector_element_basic_type(this); - __ lea($tmp$$Register, $mem$$Address); - __ movptr($idx_base_temp$$Register, $idx_base$$Register); - __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister, - $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc); - %} - ins_pipe( pipe_slow ); -%} - - -instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{ +instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{ predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask))); effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr); format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! 
using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %} ins_encode %{ @@ -4100,15 +4077,15 @@ instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, im __ xorq($mask_idx$$Register, $mask_idx$$Register); __ lea($tmp$$Register, $mem$$Address); __ kmovql($rtmp2$$Register, $mask$$KRegister); - __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); + __ vgather8b_masked(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); %} ins_pipe( pipe_slow ); %} -instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp, +instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, kReg mask, rRegP tmp, rRegP idx_base_temp, vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{ predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask))); effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr); format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %} ins_encode %{ @@ -4119,52 +4096,15 @@ instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, im __ lea($tmp$$Register, $mem$$Address); __ movptr($idx_base_temp$$Register, $idx_base$$Register); __ kmovql($rtmp2$$Register, $mask$$KRegister); - __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister, + __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc); %} ins_pipe( pipe_slow ); %} -instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{ - predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); - effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr); - format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! 
using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %} - ins_encode %{ - int vlen_enc = vector_length_encoding(this); - BasicType elem_bt = Matcher::vector_element_basic_type(this); - __ xorq($mask_idx$$Register, $mask_idx$$Register); - __ lea($tmp$$Register, $mem$$Address); - __ kmovql($rtmp2$$Register, $mask$$KRegister); - __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, - $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); - %} - ins_pipe( pipe_slow ); -%} - -instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp, - vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{ - predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); - effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr); - format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %} - ins_encode %{ - int vlen_enc = vector_length_encoding(this); - int vector_len = Matcher::vector_length(this); - BasicType elem_bt = Matcher::vector_element_basic_type(this); - __ xorq($mask_idx$$Register, $mask_idx$$Register); - __ lea($tmp$$Register, $mem$$Address); - __ movptr($idx_base_temp$$Register, $idx_base$$Register); - __ kmovql($rtmp2$$Register, $mask$$KRegister); - __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister, - $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc); - %} - ins_pipe( pipe_slow ); -%} - -instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{ +instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{ predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask))); effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr); format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! 
using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %} ins_encode %{ @@ -4177,15 +4117,15 @@ instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, im __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register); } __ xorl($mask_idx$$Register, $mask_idx$$Register); - __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); + __ vgather8b_masked(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); %} ins_pipe( pipe_slow ); %} -instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp, +instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, vec mask, rRegP tmp, rRegP idx_base_temp, vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{ predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask))); effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr); format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %} ins_encode %{ @@ -4200,53 +4140,7 @@ instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, im __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register); } __ xorl($mask_idx$$Register, $mask_idx$$Register); - __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister, - $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc); - %} - ins_pipe( pipe_slow ); -%} - -instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{ - predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); - effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr); - format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! 
using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %} - ins_encode %{ - int vlen_enc = vector_length_encoding(this); - BasicType elem_bt = Matcher::vector_element_basic_type(this); - __ lea($tmp$$Register, $mem$$Address); - __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc); - if (elem_bt == T_SHORT) { - __ movl($mask_idx$$Register, 0x55555555); - __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register); - } - __ xorl($mask_idx$$Register, $mask_idx$$Register); - __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, - $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); - %} - ins_pipe( pipe_slow ); -%} - -instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp, - vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{ - predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); - effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr); - format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %} - ins_encode %{ - int vlen_enc = vector_length_encoding(this); - int vector_len = Matcher::vector_length(this); - BasicType elem_bt = Matcher::vector_element_basic_type(this); - __ xorl($mask_idx$$Register, $mask_idx$$Register); - __ lea($tmp$$Register, $mem$$Address); - __ movptr($idx_base_temp$$Register, $idx_base$$Register); - __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc); - if (elem_bt == T_SHORT) { - __ movl($mask_idx$$Register, 0x55555555); - __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register); - } - __ xorl($mask_idx$$Register, $mask_idx$$Register); - __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister, + __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc); %} ins_pipe( pipe_slow ); diff --git a/src/hotspot/share/adlc/formssel.cpp b/src/hotspot/share/adlc/formssel.cpp index b938d5b75608d..466b9f8d7cb4e 100644 --- a/src/hotspot/share/adlc/formssel.cpp +++ b/src/hotspot/share/adlc/formssel.cpp @@ -4360,7 +4360,7 @@ bool MatchRule::is_vector() const { "RoundDoubleModeV","RotateLeftV" , "RotateRightV", "LoadVector","StoreVector", "LoadVectorGather", "StoreVectorScatter", "LoadVectorGatherMasked", "StoreVectorScatterMasked", "SelectFromTwoVector", "VectorTest", "VectorLoadMask", "VectorStoreMask", "VectorBlend", "VectorInsert", - "VectorRearrange", "VectorLoadShuffle", "VectorLoadConst", + "VectorRearrange", "VectorLoadShuffle", "VectorLoadConst", "VectorSlice", "VectorCastB2X", "VectorCastS2X", "VectorCastI2X", "VectorCastL2X", "VectorCastF2X", "VectorCastD2X", "VectorCastF2HF", "VectorCastHF2F", "VectorUCastB2X", "VectorUCastS2X", "VectorUCastI2X", @@ -4368,7 +4368,7 @@ bool MatchRule::is_vector() const { "FmaVD", "FmaVF", "FmaVHF", "PopCountVI", "PopCountVL", "PopulateIndex", 
"VectorLongToMask", "CountLeadingZerosV", "CountTrailingZerosV", "SignumVF", "SignumVD", "SaturatingAddV", "SaturatingSubV", // Next are vector mask ops. - "MaskAll", "AndVMask", "OrVMask", "XorVMask", "VectorMaskCast", + "MaskAll", "AndVMask", "OrVMask", "XorVMask", "VectorMaskCast", "VectorMaskWiden", "RoundVF", "RoundVD", // Next are not supported currently. "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D", diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index 49446b53b98bb..c6254b0bcaefe 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -1223,9 +1223,13 @@ class methodHandle; "Ljava/lang/Class;" \ "I" \ "Ljava/lang/Class;" \ + "I" \ "Ljava/lang/Object;" \ "J" \ "Ljdk/internal/vm/vector/VectorSupport$Vector;" \ + "Ljdk/internal/vm/vector/VectorSupport$Vector;" \ + "Ljdk/internal/vm/vector/VectorSupport$Vector;" \ + "Ljdk/internal/vm/vector/VectorSupport$Vector;" \ "Ljdk/internal/vm/vector/VectorSupport$VectorMask;" \ "Ljava/lang/Object;" \ "I[II" \ @@ -1240,6 +1244,7 @@ class methodHandle; "Ljava/lang/Class;" \ "I" \ "Ljava/lang/Class;" \ + "I" \ "Ljava/lang/Object;" \ "J" \ "Ljdk/internal/vm/vector/VectorSupport$Vector;" \ diff --git a/src/hotspot/share/opto/classes.hpp b/src/hotspot/share/opto/classes.hpp index bc259eed2d101..2c898b7503258 100644 --- a/src/hotspot/share/opto/classes.hpp +++ b/src/hotspot/share/opto/classes.hpp @@ -513,6 +513,7 @@ macro(VectorUnbox) macro(VectorMaskWrapper) macro(VectorMaskCmp) macro(VectorMaskCast) +macro(VectorMaskWiden) macro(VectorTest) macro(VectorBlend) macro(VectorRearrange) @@ -535,6 +536,7 @@ macro(VectorUCastS2X) macro(VectorUCastI2X) macro(VectorizedHashCode) macro(VectorInsert) +macro(VectorSlice) macro(MaskAll) macro(AndVMask) macro(OrVMask) diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 783631bf08d89..fb0de99f6e09b 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -722,9 +722,9 @@ bool LibraryCallKit::try_to_inline(int predicate) { case vmIntrinsics::_VectorStoreMaskedOp: return inline_vector_mem_masked_operation(/*is_store=*/true); case vmIntrinsics::_VectorGatherOp: - return inline_vector_gather_scatter(/*is_scatter*/ false); + return inline_vector_gather_scatter(/*is_scatter=*/ false); case vmIntrinsics::_VectorScatterOp: - return inline_vector_gather_scatter(/*is_scatter*/ true); + return inline_vector_gather_scatter(/*is_scatter=*/ true); case vmIntrinsics::_VectorReductionCoerced: return inline_vector_reduction(); case vmIntrinsics::_VectorTest: diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp index ad1ce71c374bf..0bee6dfb0558b 100644 --- a/src/hotspot/share/opto/library_call.hpp +++ b/src/hotspot/share/opto/library_call.hpp @@ -383,6 +383,8 @@ class LibraryCallKit : public GraphKit { bool inline_vector_select_from_two_vectors(); Node* gen_call_to_vector_math(int vector_api_op_id, BasicType bt, int num_elem, Node* opd1, Node* opd2); + Node* gen_gather_load_subword(Node* addr, Node* indexes, Node* indexes1, Node* indexes2, Node* indexes3, const TypeVect* vector_type); + Node* gen_gather_load_masked_subword(Node* addr, Node* indexes, Node* indexes1, Node* indexes2, Node* indexes3, Node* mask, const TypeVect* vector_type); enum VectorMaskUseType { VecMaskUseLoad = 1 << 0, diff --git a/src/hotspot/share/opto/matcher.cpp 
b/src/hotspot/share/opto/matcher.cpp index e34a43cc1e2f6..95ca44286965a 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -2440,6 +2440,7 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) { n->del_req(4); break; } + case Op_VectorSlice: case Op_SelectFromTwoVector: case Op_LoopLimit: { Node* pair1 = new BinaryNode(n->in(1), n->in(2)); @@ -2517,22 +2518,7 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) { n->del_req(3); break; } - case Op_LoadVectorGather: - if (is_subword_type(n->bottom_type()->is_vect()->element_basic_type())) { - Node* pair = new BinaryNode(n->in(MemNode::ValueIn), n->in(MemNode::ValueIn+1)); - n->set_req(MemNode::ValueIn, pair); - n->del_req(MemNode::ValueIn+1); - } - break; - case Op_LoadVectorGatherMasked: - if (is_subword_type(n->bottom_type()->is_vect()->element_basic_type())) { - Node* pair2 = new BinaryNode(n->in(MemNode::ValueIn + 1), n->in(MemNode::ValueIn + 2)); - Node* pair1 = new BinaryNode(n->in(MemNode::ValueIn), pair2); - n->set_req(MemNode::ValueIn, pair1); - n->del_req(MemNode::ValueIn+2); - n->del_req(MemNode::ValueIn+1); - break; - } // fall-through + case Op_LoadVectorGatherMasked: // fall-through case Op_StoreVectorScatter: { Node* pair = new BinaryNode(n->in(MemNode::ValueIn), n->in(MemNode::ValueIn+1)); n->set_req(MemNode::ValueIn, pair); diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index baf43b0d5388c..4fe3029046a43 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -343,6 +343,9 @@ class Matcher : public PhaseTransform { static bool vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen); + // Return true if gather/scatter needs vector index as input. 
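+  // Backends that expand subword gathers from the index array directly
+  // (e.g. x86) return false for subword types; backends whose gather
+  // instructions consume a vector of indices (e.g. SVE) return true.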
+ static bool gather_scatter_needs_vector_index(BasicType elem_bt, int vlen); + static const RegMask* predicate_reg_mask(void); // Vector width in bytes @@ -365,6 +368,8 @@ class Matcher : public PhaseTransform { // Vector ideal reg static uint vector_ideal_reg(int len); + // Vector ideal reg size + static uint vector_ideal_reg_size(int len); // Vector length static uint vector_length(const Node* n); diff --git a/src/hotspot/share/opto/node.hpp b/src/hotspot/share/opto/node.hpp index 1cb9009ef276b..a84df10a7b8f4 100644 --- a/src/hotspot/share/opto/node.hpp +++ b/src/hotspot/share/opto/node.hpp @@ -192,6 +192,7 @@ class StoreVectorScatterNode; class StoreVectorScatterMaskedNode; class VerifyVectorAlignmentNode; class VectorMaskCmpNode; +class VectorMaskWidenNode; class VectorUnboxNode; class VectorSet; class VectorReinterpretNode; @@ -748,6 +749,7 @@ class Node { DEFINE_CLASS_ID(NegV, Vector, 8) DEFINE_CLASS_ID(SaturatingVector, Vector, 9) DEFINE_CLASS_ID(MulVL, Vector, 10) + DEFINE_CLASS_ID(VectorMaskWiden, Vector, 11) DEFINE_CLASS_ID(Con, Type, 8) DEFINE_CLASS_ID(ConI, Con, 0) DEFINE_CLASS_ID(SafePointScalarMerge, Type, 9) @@ -1009,6 +1011,7 @@ class Node { DEFINE_CLASS_QUERY(Type) DEFINE_CLASS_QUERY(Vector) DEFINE_CLASS_QUERY(VectorMaskCmp) + DEFINE_CLASS_QUERY(VectorMaskWiden) DEFINE_CLASS_QUERY(VectorUnbox) DEFINE_CLASS_QUERY(VectorReinterpret) DEFINE_CLASS_QUERY(CompressV) diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp index e33d7b1968682..9984e0bf4626c 100644 --- a/src/hotspot/share/opto/vectorIntrinsics.cpp +++ b/src/hotspot/share/opto/vectorIntrinsics.cpp @@ -1124,18 +1124,144 @@ bool LibraryCallKit::inline_vector_mem_masked_operation(bool is_store) { return true; } -// , -// W extends Vector, -// S extends VectorSpecies, -// M extends VectorMask, -// E> -// V loadWithMap(Class vectorClass, Class maskClass, Class elementType, int length, -// Class> vectorIndexClass, -// Object base, long offset, // Unsafe addressing -// W index_vector, M m, -// C container, int index, int[] indexMap, int indexM, S s, // Arguments for default implementation -// LoadVectorOperationWithMap defaultImpl) +Node* LibraryCallKit::gen_gather_load_subword(Node* addr, Node* indexes, Node* indexes1, Node* indexes2, + Node* indexes3, const TypeVect* vector_type) { + BasicType elem_bt = vector_type->element_basic_type(); + uint elem_num = vector_type->length(); + const TypeVect* index_vect_type = indexes->bottom_type()->isa_vect(); + const TypePtr* addr_type = gvn().type(addr)->isa_ptr(); + Node* addr_mem = memory(addr); + + // The first gather. + Node* vgather = gvn().transform(new LoadVectorGatherNode(control(), addr_mem, addr, addr_type, vector_type, indexes)); + + uint index_elem_num = index_vect_type != nullptr ? index_vect_type->length() : 0; + uint vector_reg_size = Matcher::vector_ideal_reg_size(vector_type->length_in_bytes()); + uint max_elem_num = vector_reg_size / type2aelembytes(elem_bt); + // The second gather. + if (indexes1 != nullptr) { + assert(index_vect_type != nullptr, "indexes must be a vector"); + assert(Type::equals(indexes1->bottom_type(), index_vect_type), "invalid vector type"); + Node* vgather1 = gvn().transform(new LoadVectorGatherNode(control(), addr_mem, addr, addr_type, vector_type, indexes1)); + // Merge the second gather with the first gather result. 
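+    // VectorSlice rotates vgather1 so that its index_elem_num results move
+    // from the low lanes up to lanes [index_elem_num, 2 * index_elem_num);
+    // OrV then merges them with the first gather, whose other lanes are zero.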
+    Node* idx = gvn().makecon(TypeInt::make(max_elem_num - index_elem_num));
+    Node* vslice = gvn().transform(new VectorSliceNode(vgather1, vgather1, idx));
+    vgather = gvn().transform(new OrVNode(vgather, vslice, vector_type));
+  }
+
+  // The third and fourth gathers for byte type.
+  if (indexes2 != nullptr) {
+    assert(elem_bt == T_BYTE, "only byte vectors need more than two gather loads");
+    assert(indexes3 != nullptr, "indexes3 must be non-null");
+    assert(Type::equals(indexes2->bottom_type(), index_vect_type), "invalid vector type");
+    assert(Type::equals(indexes3->bottom_type(), index_vect_type), "invalid vector type");
+    Node* vgather2 = gvn().transform(new LoadVectorGatherNode(control(), addr_mem, addr, addr_type, vector_type, indexes2));
+    // Merge the third gather with previous results.
+    Node* idx = gvn().makecon(TypeInt::make(max_elem_num - 2 * index_elem_num));
+    Node* vslice = gvn().transform(new VectorSliceNode(vgather2, vgather2, idx));
+    vgather = gvn().transform(new OrVNode(vgather, vslice, vector_type));
+
+    Node* vgather3 = gvn().transform(new LoadVectorGatherNode(control(), addr_mem, addr, addr_type, vector_type, indexes3));
+    // Merge the fourth gather with previous results.
+    idx = gvn().makecon(TypeInt::make(max_elem_num - 3 * index_elem_num));
+    vslice = gvn().transform(new VectorSliceNode(vgather3, vgather3, idx));
+    vgather = gvn().transform(new OrVNode(vgather, vslice, vector_type));
+  }
+  return vgather;
+}
+
+Node* LibraryCallKit::gen_gather_load_masked_subword(Node* addr, Node* indexes, Node* indexes1,
+                                                     Node* indexes2, Node* indexes3, Node* mask,
+                                                     const TypeVect* vector_type) {
+  BasicType elem_bt = vector_type->element_basic_type();
+  const TypeVect* index_vect_type = indexes->bottom_type()->isa_vect();
+  const TypePtr* addr_type = gvn().type(addr)->isa_ptr();
+  Node* addr_mem = memory(addr);
+
+  // Case for architectures that do not support subword vector gathers with a vector index.
+  // The mask needs to be kept as it is.
+  if (index_vect_type == nullptr) {
+    return gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vector_type, indexes, mask));
+  }
+
+  // The subword-typed mask input needs to be widened to int type, and the element
+  // number of the new mask is the same as the index vector's.
+  uint index_elem_num = index_vect_type->length();
+  const TypeVect* mask_vt = TypeVect::makemask(T_INT, index_elem_num);
+  Node* vmask_temp = mask;
+  Node* vmask = nullptr;
+
+  // The first masked vector gather with vector index. Generate a new vector mask by widening
+  // the lower half of the mask to int type. For a byte vector, this may be the lowest quarter
+  // of the mask.
+  if (elem_bt == T_BYTE) {
+    const TypeVect* mask_vt_s = TypeVect::makemask(T_SHORT, MaxVectorSize / type2aelembytes(T_SHORT));
+    vmask_temp = gvn().transform(new VectorMaskWidenNode(mask, mask_vt_s, /* is_lo */true));
+    vmask = gvn().transform(new VectorMaskWidenNode(vmask_temp, mask_vt, /* is_lo */true));
+  } else {
+    vmask = gvn().transform(new VectorMaskWidenNode(mask, mask_vt, /* is_lo */true));
+  }
+  Node* vgather = gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vector_type, indexes, vmask));
+
+  // The second masked vector gather with vector index.
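+  // Illustrative mask flow for a short vector, assuming 8 short lanes and
+  // 4 int lanes per index vector (hypothetical lane values):
+  //   mask (short)   = [m7 m6 m5 m4 m3 m2 m1 m0]
+  //   widen_lo(mask) -> [m3 m2 m1 m0]  (int mask for the first gather)
+  //   widen_hi(mask) -> [m7 m6 m5 m4]  (int mask for the second gather)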
+  uint vector_reg_size = Matcher::vector_ideal_reg_size(vector_type->length_in_bytes());
+  uint max_elem_num = vector_reg_size / type2aelembytes(elem_bt);
+  if (indexes1 != nullptr) {
+    assert(index_vect_type != nullptr, "indexes must be a vector");
+    assert(Type::equals(indexes1->bottom_type(), index_vect_type), "invalid vector type");
+
+    // Generate a new vector mask by widening the higher half of the mask to int type. For a byte
+    // vector, this may be the second quarter of the mask, counting from the lowest element.
+    vmask = gvn().transform(new VectorMaskWidenNode(vmask_temp, mask_vt, /* is_lo */false));
+    Node* vgather1 = gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vector_type, indexes1, vmask));
+    // Merge the second gather with the first gather result.
+    Node* idx = gvn().makecon(TypeInt::make(max_elem_num - index_elem_num));
+    Node* slice = gvn().transform(new VectorSliceNode(vgather1, vgather1, idx));
+    vgather = gvn().transform(new OrVNode(vgather, slice, vector_type));
+  }
+
+  // The third and fourth masked vector gathers for a byte vector.
+  if (indexes2 != nullptr) {
+    assert(elem_bt == T_BYTE, "only byte vectors need more than two gather loads");
+    assert(indexes3 != nullptr, "indexes3 must be non-null");
+    assert(Type::equals(indexes2->bottom_type(), index_vect_type), "invalid vector type");
+    assert(Type::equals(indexes3->bottom_type(), index_vect_type), "invalid vector type");
+
+    // The third masked vector gather with vector index. The new vector mask is widened from
+    // the third quarter of the input mask.
+    const TypeVect* mask_vt_s = TypeVect::makemask(T_SHORT, MaxVectorSize / type2aelembytes(T_SHORT));
+    vmask_temp = gvn().transform(new VectorMaskWidenNode(mask, mask_vt_s, /* is_lo */false));
+    vmask = gvn().transform(new VectorMaskWidenNode(vmask_temp, mask_vt, /* is_lo */true));
+    Node* vgather2 = gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vector_type, indexes2, vmask));
+    // Merge the third gather with previous results.
+    Node* idx = gvn().makecon(TypeInt::make(max_elem_num - 2 * index_elem_num));
+    Node* slice = gvn().transform(new VectorSliceNode(vgather2, vgather2, idx));
+    vgather = gvn().transform(new OrVNode(vgather, slice, vector_type));
+
+    // The fourth masked vector gather with vector index. The new vector mask is widened from
+    // the fourth quarter of the input mask.
+    vmask = gvn().transform(new VectorMaskWidenNode(vmask_temp, mask_vt, /* is_lo */false));
+    Node* vgather3 = gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vector_type, indexes3, vmask));
+    // Merge the fourth gather with previous results.
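+    // After this last merge the four partial results occupy disjoint quarters
+    // of the destination vector, lowest quarter first:
+    //   vgather = [part4 | part3 | part2 | part1]
+    // which relies on each gather leaving its inactive lanes zeroed.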
+ idx = gvn().makecon(TypeInt::make(max_elem_num - 3 * index_elem_num)); + slice = gvn().transform(new VectorSliceNode(vgather3, vgather3, idx)); + vgather = gvn().transform(new OrVNode(vgather, slice, vector_type)); + } + return vgather; +} + +// +// , +// W extends Vector, +// M extends VectorMask,_ +// E> +// V loadWithMap(Class vClass, Class mClass, Class eClass, int length, +// Class> vectorIndexClass, int indexLength, +// Object base, long offset, +// W indexVector1, W index_vector2, W index_vector3, W index_vector4, +// M m, C container, int index, int[] indexMap, int indexM, S s, +// LoadVectorOperationWithMap defaultImpl) // // , @@ -1143,7 +1269,8 @@ bool LibraryCallKit::inline_vector_mem_masked_operation(bool is_store) { // M extends VectorMask, // E> // void storeWithMap(Class vectorClass, Class maskClass, Class elementType, -// int length, Class> vectorIndexClass, Object base, long offset, // Unsafe addressing +// int length, Class> vectorIndexClass, +// int indexLength, Object base, long offset, // Unsafe addressing // W index_vector, V v, M m, // C container, int index, int[] indexMap, int indexM, // Arguments for default implementation // StoreVectorOperationWithMap defaultImpl) @@ -1154,14 +1281,17 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { const TypeInstPtr* elem_klass = gvn().type(argument(2))->isa_instptr(); const TypeInt* vlen = gvn().type(argument(3))->isa_int(); const TypeInstPtr* vector_idx_klass = gvn().type(argument(4))->isa_instptr(); + const TypeInt* idx_vlen = gvn().type(argument(5))->isa_int(); if (vector_klass == nullptr || elem_klass == nullptr || vector_idx_klass == nullptr || vlen == nullptr || - vector_klass->const_oop() == nullptr || elem_klass->const_oop() == nullptr || vector_idx_klass->const_oop() == nullptr || !vlen->is_con()) { - log_if_needed(" ** missing constant: vclass=%s etype=%s vlen=%s viclass=%s", + idx_vlen == nullptr || vector_klass->const_oop() == nullptr || elem_klass->const_oop() == nullptr || + vector_idx_klass->const_oop() == nullptr || !vlen->is_con() || !idx_vlen->is_con()) { + log_if_needed(" ** missing constant: vclass=%s etype=%s vlen=%s viclass=%s idx_vlen=%s", NodeClassNames[argument(0)->Opcode()], NodeClassNames[argument(2)->Opcode()], NodeClassNames[argument(3)->Opcode()], - NodeClassNames[argument(4)->Opcode()]); + NodeClassNames[argument(4)->Opcode()], + NodeClassNames[argument(5)->Opcode()]); return false; // not enough info for intrinsification } @@ -1178,8 +1308,10 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { BasicType elem_bt = elem_type->basic_type(); int num_elem = vlen->get_con(); + int idx_num_elem = idx_vlen->get_con(); - const Type* vmask_type = gvn().type(is_scatter ? argument(10) : argument(9)); + Node* m = is_scatter ? argument(11) : argument(13); + const Type* vmask_type = gvn().type(m); bool is_masked_op = vmask_type != TypePtr::NULL_PTR; if (is_masked_op) { if (mask_klass == nullptr || mask_klass->const_oop() == nullptr) { @@ -1215,25 +1347,50 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { } } - // Check that the vector holding indices is supported by architecture - // For sub-word gathers expander receive index array. 
- if (!is_subword_type(elem_bt) && !arch_supports_vector(Op_LoadVector, num_elem, T_INT, VecMaskNotUsed)) { + bool needs_vector_index = Matcher::gather_scatter_needs_vector_index(elem_bt, num_elem); + if (needs_vector_index) { + // Check that the vector holding indices is supported by architecture + if (!arch_supports_vector(Op_LoadVector, idx_num_elem, T_INT, VecMaskNotUsed)) { log_if_needed(" ** not supported: arity=%d op=%s/loadindex vlen=%d etype=int is_masked_op=%d", - is_scatter, is_scatter ? "scatter" : "gather", - num_elem, is_masked_op ? 1 : 0); + is_scatter, is_scatter ? "scatter" : "gather", + idx_num_elem, is_masked_op ? 1 : 0); return false; // not supported + } + + // Check more ops that are necessary to finish the whole subword gather with vector indexes. + if (!is_scatter && gvn().type(argument(10)) != TypePtr::NULL_PTR) { + if (!arch_supports_vector(Op_VectorSlice, num_elem, elem_bt, VecMaskNotUsed) || + !arch_supports_vector(Op_OrV, num_elem, elem_bt, VecMaskNotUsed)) { + log_if_needed(" ** not supported: op=gather/merge vlen=%d etype=%s is_masked_op=%d", + num_elem, type2name(elem_bt), is_masked_op ? 1 : 0); + return false; // not supported + } + if (is_masked_op && !arch_supports_vector(Op_VectorMaskWiden, idx_num_elem, T_INT, VecMaskNotUsed)) { + log_if_needed(" ** not supported: op=gather/maskwiden vlen=%d etype=%s is_masked_op=1", + idx_num_elem, type2name(elem_bt)); + return false; // not supported + } + } } - Node* base = argument(5); - Node* offset = ConvL2X(argument(6)); + Node* base = argument(6); + Node* offset = ConvL2X(argument(7)); // Save state and restore on bailout uint old_sp = sp(); SafePointNode* old_map = clone_map(); - Node* addr = make_unsafe_address(base, offset, elem_bt, true); + Node* addr = nullptr; + if (needs_vector_index) { + addr = make_unsafe_address(base, offset, elem_bt, true); + } else { + assert(is_subword_type(elem_bt), "Only subword gather operation supports non-vector indexes"); + assert(!is_scatter, "Only supports gather operation for subword types now"); + Node* index = argument(15); + addr = array_element_address(base, index, elem_bt); + } - const TypePtr *addr_type = gvn().type(addr)->isa_ptr(); + const TypePtr* addr_type = gvn().type(addr)->isa_ptr(); const TypeAryPtr* arr_type = addr_type->isa_aryptr(); // The array must be consistent with vector type @@ -1255,26 +1412,66 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { return false; } - Node* index_vect = nullptr; + // Get the indexes for gather/scatter. + Node* indexes = nullptr; const TypeInstPtr* vbox_idx_type = TypeInstPtr::make_exact(TypePtr::NotNull, vbox_idx_klass); - if (!is_subword_type(elem_bt)) { - index_vect = unbox_vector(argument(8), vbox_idx_type, T_INT, num_elem); - if (index_vect == nullptr) { + if (!needs_vector_index) { + Node* indexMap = argument(16); + Node* indexM = argument(17); + indexes = array_element_address(indexMap, indexM, T_INT); + } else { + // Get the first index vector. + indexes = unbox_vector(argument(9), vbox_idx_type, T_INT, idx_num_elem); + if (indexes == nullptr) { set_map(old_map); set_sp(old_sp); return false; } } + // Get other index vectors if they are not nullptr for subword gather operation. + Node* indexes1 = nullptr; + Node* indexes2 = nullptr; + Node* indexes3 = nullptr; + if (!is_scatter && needs_vector_index) { + // Get the second index vector if they are not nullptr. 
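+    // (Arguments 9..12 carry up to four int index vectors; subword gathers may
+    // need up to two of them for short and up to four for byte, with unused
+    // trailing slots passed as null.)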
+ if (gvn().type(argument(10)) != TypePtr::NULL_PTR) { + assert(is_subword_type(elem_bt), "Only subword gather needs more index vectors"); + indexes1 = unbox_vector(argument(10), vbox_idx_type, T_INT, idx_num_elem); + if (indexes1 == nullptr) { + set_map(old_map); + set_sp(old_sp); + return false; + } + } + + // Get the third and fourth index vectors if they are not nullptr. + if (gvn().type(argument(11)) != TypePtr::NULL_PTR) { + assert(elem_bt == T_BYTE, "Only byte gather needs more than 2 index vectors"); + if (gvn().type(argument(12)) == TypePtr::NULL_PTR) { + set_map(old_map); + set_sp(old_sp); + return false; + } + + indexes2 = unbox_vector(argument(11), vbox_idx_type, T_INT, idx_num_elem); + indexes3 = unbox_vector(argument(12), vbox_idx_type, T_INT, idx_num_elem); + if (indexes2 == nullptr || indexes3 == nullptr) { + set_map(old_map); + set_sp(old_sp); + return false; + } + } + } + + // Get the vector mask value. Node* mask = nullptr; if (is_masked_op) { ciKlass* mbox_klass = mask_klass->const_oop()->as_instance()->java_lang_Class_klass(); const TypeInstPtr* mbox_type = TypeInstPtr::make_exact(TypePtr::NotNull, mbox_klass); - mask = unbox_vector(is_scatter ? argument(10) : argument(9), mbox_type, elem_bt, num_elem); + mask = unbox_vector(m, mbox_type, elem_bt, num_elem); if (mask == nullptr) { - log_if_needed(" ** unbox failed mask=%s", - is_scatter ? NodeClassNames[argument(10)->Opcode()] - : NodeClassNames[argument(9)->Opcode()]); + log_if_needed(" ** unbox failed mask=%s", NodeClassNames[m->Opcode()]); set_map(old_map); set_sp(old_sp); return false; @@ -1283,7 +1480,7 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { const TypeVect* vector_type = TypeVect::make(elem_bt, num_elem); if (is_scatter) { - Node* val = unbox_vector(argument(9), vbox_type, elem_bt, num_elem); + Node* val = unbox_vector(argument(10), vbox_type, elem_bt, num_elem); if (val == nullptr) { set_map(old_map); set_sp(old_sp); @@ -1293,29 +1490,24 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { Node* vstore = nullptr; if (mask != nullptr) { - vstore = gvn().transform(new StoreVectorScatterMaskedNode(control(), memory(addr), addr, addr_type, val, index_vect, mask)); + vstore = gvn().transform(new StoreVectorScatterMaskedNode(control(), memory(addr), addr, addr_type, val, indexes, mask)); } else { - vstore = gvn().transform(new StoreVectorScatterNode(control(), memory(addr), addr, addr_type, val, index_vect)); + vstore = gvn().transform(new StoreVectorScatterNode(control(), memory(addr), addr, addr_type, val, indexes)); } set_memory(vstore, addr_type); } else { Node* vload = nullptr; - Node* index = argument(11); - Node* indexMap = argument(12); - Node* indexM = argument(13); if (mask != nullptr) { if (is_subword_type(elem_bt)) { - Node* index_arr_base = array_element_address(indexMap, indexM, T_INT); - vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, index_arr_base, mask, index)); + vload = gen_gather_load_masked_subword(addr, indexes, indexes1, indexes2, indexes3, mask, vector_type); } else { - vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, index_vect, mask)); + vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, indexes, mask)); } } else { if (is_subword_type(elem_bt)) { - Node* index_arr_base = array_element_address(indexMap, indexM, T_INT); - vload = gvn().transform(new 
LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, index_arr_base, index)); + vload = gen_gather_load_subword(addr, indexes, indexes1, indexes2, indexes3, vector_type); } else { - vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, index_vect)); + vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, indexes)); } } Node* box = box_vector(vload, vbox_type, elem_bt, num_elem); @@ -1323,7 +1515,6 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { } destruct_map_clone(old_map); - C->set_max_vector_size(MAX2(C->max_vector_size(), (uint)(num_elem * type2aelembytes(elem_bt)))); return true; } diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 084b70a690653..94cdfdc6cea4d 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -1041,19 +1041,31 @@ Node* VectorNode::try_to_gen_masked_vector(PhaseGVN* gvn, Node* node, const Type uint vlen = vt->length(); BasicType bt = vt->element_basic_type(); + BasicType mask_bt = bt; + uint mask_vlen = vlen; + if (vopc == Op_LoadVectorGather && is_subword_type(bt)) { + // It uses the index vector's type as the mask type for subword gather load. + const TypeVect* index_vt = node->in(MemNode::ValueIn)->bottom_type()->isa_vect(); + if (index_vt == nullptr) { + return nullptr; + } + mask_bt = index_vt->element_basic_type(); + mask_vlen = index_vt->length(); + } + // Predicated vectors do not need to add another mask input if (node->is_predicated_vector() || !Matcher::has_predicated_vectors() || !Matcher::match_rule_supported_vector_masked(vopc, vlen, bt) || - !Matcher::match_rule_supported_vector(Op_VectorMaskGen, vlen, bt)) { + !Matcher::match_rule_supported_vector(Op_VectorMaskGen, mask_vlen, mask_bt)) { return nullptr; } Node* mask = nullptr; // Generate a vector mask for vector operation whose vector length is lower than the // hardware supported max vector length. 
- if (vt->length_in_bytes() < (uint)MaxVectorSize) { + if (mask_vlen * type2aelembytes(mask_bt) < (uint)MaxVectorSize) { Node* length = gvn->transform(new ConvI2LNode(gvn->makecon(TypeInt::make(vlen)))); - mask = gvn->transform(VectorMaskGenNode::make(length, bt, vlen)); + mask = gvn->transform(VectorMaskGenNode::make(length, mask_bt, mask_vlen)); } else { return nullptr; } diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index 36706a7b7a14b..eac2c9bc9cc6b 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -1112,25 +1112,18 @@ class LoadVectorNode : public LoadNode { // Load Vector from memory via index map class LoadVectorGatherNode : public LoadVectorNode { public: - LoadVectorGatherNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* offset = nullptr) + LoadVectorGatherNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices) : LoadVectorNode(c, mem, adr, at, vt) { init_class_id(Class_LoadVectorGather); add_req(indices); DEBUG_ONLY(bool is_subword = is_subword_type(vt->element_basic_type())); assert(is_subword || indices->bottom_type()->is_vect(), "indices must be in vector"); - assert(is_subword || !offset, ""); assert(req() == MemNode::ValueIn + 1, "match_edge expects that index input is in MemNode::ValueIn"); - if (offset) { - add_req(offset); - } } virtual int Opcode() const; virtual uint match_edge(uint idx) const { - return idx == MemNode::Address || - idx == MemNode::ValueIn || - ((is_subword_type(vect_type()->element_basic_type())) && - idx == MemNode::ValueIn + 1); + return idx == MemNode::Address || idx == MemNode::ValueIn; } virtual int store_Opcode() const { // Ensure it is different from any store opcode to avoid folding when indices are used @@ -1249,23 +1242,19 @@ class LoadVectorMaskedNode : public LoadVectorNode { // Load Vector from memory via index map under the influence of a predicate register(mask). class LoadVectorGatherMaskedNode : public LoadVectorNode { public: - LoadVectorGatherMaskedNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* mask, Node* offset = nullptr) + LoadVectorGatherMaskedNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* mask) : LoadVectorNode(c, mem, adr, at, vt) { init_class_id(Class_LoadVectorGatherMasked); add_req(indices); add_req(mask); assert(req() == MemNode::ValueIn + 2, "match_edge expects that last input is in MemNode::ValueIn+1"); - if (is_subword_type(vt->element_basic_type())) { - add_req(offset); - } + assert(is_subword_type(vt->element_basic_type()) || indices->bottom_type()->is_vect(), "indices must be in vector"); } virtual int Opcode() const; virtual uint match_edge(uint idx) const { return idx == MemNode::Address || idx == MemNode::ValueIn || - idx == MemNode::ValueIn + 1 || - (is_subword_type(vect_type()->is_vect()->element_basic_type()) && - idx == MemNode::ValueIn + 2); } + idx == MemNode::ValueIn + 1; } virtual int store_Opcode() const { // Ensure it is different from any store opcode to avoid folding when indices and mask are used return -1; @@ -1745,6 +1734,24 @@ class VectorRearrangeNode : public VectorNode { Node* vec_shuffle() const { return in(2); } }; +// Generate a vector by slicing the two source vectors based on an index. 
+// +// Copy the indexed byte up to the last byte of the first source vector +// to the bottom of the result vector, then fill the remainder of the +// result starting from the first byte of the second source vector. +// +// E.g. src1 = [hgfedcba] src2 = [ponmlkji] index = 3 +// dst = [kjihgfed] +class VectorSliceNode : public VectorNode { + public: + VectorSliceNode(Node* vec1, Node* vec2, Node* index) + : VectorNode(vec1, vec2, index, vec1->bottom_type()->is_vect()) { + assert(index->bottom_type()->isa_int(), "index must be an integral value"); + assert(index->is_Con(), "index must be a constant"); + } + + virtual int Opcode() const; +}; // Select elements from two source vectors based on the wrapped indexes held in // the first vector. @@ -1804,6 +1811,28 @@ class VectorMaskCastNode : public VectorNode { virtual int Opcode() const; }; +// Unpack the elements to twice size. +class VectorMaskWidenNode : public VectorNode { + private: + // "_is_lo" is used to denote whether the lower half or + // the upper half of the elements are widened. + // E.g. src = [1111 0101] + // _is_lo = true, dst = [0001 0001] + // _is_lo = false, dst = [0101 0101] + bool _is_lo; + + public: + VectorMaskWidenNode(Node* in, const TypeVect* vt, bool is_lo) : VectorNode(in, vt), _is_lo(is_lo) { + init_class_id(Class_VectorMaskWiden); + const TypeVect* in_vt = in->bottom_type()->is_vect(); + assert(type2aelembytes(in_vt->element_basic_type()) == type2aelembytes(vt->element_basic_type()) / 2, "must be half size"); + } + + bool is_lo() const { return _is_lo; } + virtual int Opcode() const; + virtual uint size_of() const { return sizeof(*this); } +}; + // This is intended for use as a simple reinterpret node that has no cast. class VectorReinterpretNode : public VectorNode { private: diff --git a/src/java.base/share/classes/jdk/internal/vm/vector/VectorSupport.java b/src/java.base/share/classes/jdk/internal/vm/vector/VectorSupport.java index cbf30da228934..0808f37d2370c 100644 --- a/src/java.base/share/classes/jdk/internal/vm/vector/VectorSupport.java +++ b/src/java.base/share/classes/jdk/internal/vm/vector/VectorSupport.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -451,8 +451,8 @@ public interface LoadVectorOperationWithMap vClass, Class mClass, Class eClass, int length, Class> vectorIndexClass, - Object base, long offset, - W index_vector, + int indexLength, Object base, long offset, + W indexVector1, W indexVector2, W indexVector3, W indexVector4, M m, C container, int index, int[] indexMap, int indexM, S s, LoadVectorOperationWithMap defaultImpl) { assert isNonCapturingLambda(defaultImpl) : defaultImpl; @@ -518,7 +518,7 @@ public interface StoreVectorOperationWithMap vClass, Class mClass, Class eClass, int length, Class> vectorIndexClass, - Object base, long offset, + int indexLength, Object base, long offset, W index_vector, V v, M m, C container, int index, int[] indexMap, int indexM, StoreVectorOperationWithMap defaultImpl) { diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java index ed8d273ff37db..5e608807b0564 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java @@ -3117,17 +3117,30 @@ ByteVector fromArray(VectorSpecies species, } // Check indices are within array bounds. - for (int i = 0; i < vsp.length(); i += lsp.length()) { - IntVector vix = IntVector - .fromArray(lsp, indexMap, mapOffset + i) - .add(offset); - VectorIntrinsics.checkIndex(vix, a.length); + IntVector vix0 = IntVector.fromArray(lsp, indexMap, mapOffset).add(offset); + VectorIntrinsics.checkIndex(vix0, a.length); + + int vlen = vsp.length(); + int idx_vlen = lsp.length(); + IntVector vix1 = null; + if (vlen >= idx_vlen * 2) { + vix1 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen).add(offset); + VectorIntrinsics.checkIndex(vix1, a.length); + } + + IntVector vix2 = null; + IntVector vix3 = null; + if (vlen == idx_vlen * 4) { + vix2 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 2).add(offset); + VectorIntrinsics.checkIndex(vix2, a.length); + vix3 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 3).add(offset); + VectorIntrinsics.checkIndex(vix3, a.length); } return VectorSupport.loadWithMap( vectorType, null, byte.class, vsp.laneCount(), - lsp.vectorType(), - a, ARRAY_BASE, null, null, + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, vix2, vix3, null, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); @@ -3879,17 +3892,30 @@ ByteVector fromArray0Template(Class maskClass, byte[] a, int offset, // Check indices are within array bounds. // FIXME: Check index under mask controlling. 
- for (int i = 0; i < vsp.length(); i += lsp.length()) { - IntVector vix = IntVector - .fromArray(lsp, indexMap, mapOffset + i) - .add(offset); - VectorIntrinsics.checkIndex(vix, a.length); + IntVector vix0 = IntVector.fromArray(lsp, indexMap, mapOffset).add(offset); + VectorIntrinsics.checkIndex(vix0, a.length); + + int vlen = vsp.length(); + int idx_vlen = lsp.length(); + IntVector vix1 = null; + if (vlen >= idx_vlen * 2) { + vix1 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen).add(offset); + VectorIntrinsics.checkIndex(vix1, a.length); + } + + IntVector vix2 = null; + IntVector vix3 = null; + if (vlen == idx_vlen * 4) { + vix2 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 2).add(offset); + VectorIntrinsics.checkIndex(vix2, a.length); + vix3 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 3).add(offset); + VectorIntrinsics.checkIndex(vix3, a.length); } return VectorSupport.loadWithMap( vectorType, maskClass, byte.class, vsp.laneCount(), - lsp.vectorType(), - a, ARRAY_BASE, null, m, + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, vix2, vix3, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/DoubleVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/DoubleVector.java index 5fbf02f87bd93..5f9c6f481b8a0 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/DoubleVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/DoubleVector.java @@ -2910,8 +2910,8 @@ DoubleVector fromArray(VectorSpecies species, return VectorSupport.loadWithMap( vectorType, null, double.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, null, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, null, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); @@ -3201,7 +3201,7 @@ void intoArray(double[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), null, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, null, a, offset, indexMap, mapOffset, @@ -3396,8 +3396,8 @@ DoubleVector fromArray0Template(Class maskClass, double[] a, int offset, return VectorSupport.loadWithMap( vectorType, maskClass, double.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, m, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); @@ -3512,7 +3512,7 @@ void intoArray0Template(Class maskClass, double[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, m, a, offset, indexMap, mapOffset, diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java index 26fbe64742d6f..9ba457fa55668 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java @@ -2916,8 +2916,8 @@ FloatVector fromArray(VectorSpecies species, return VectorSupport.loadWithMap( vectorType, null, float.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, null, + 
isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, null, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); @@ -3188,7 +3188,7 @@ void intoArray(float[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), null, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, null, a, offset, indexMap, mapOffset, @@ -3365,8 +3365,8 @@ FloatVector fromArray0Template(Class maskClass, float[] a, int offset, return VectorSupport.loadWithMap( vectorType, maskClass, float.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, m, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); @@ -3462,7 +3462,7 @@ void intoArray0Template(Class maskClass, float[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, m, a, offset, indexMap, mapOffset, diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/IntVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/IntVector.java index 076a66ed6a543..b3d4c938e8e00 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/IntVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/IntVector.java @@ -3094,8 +3094,8 @@ IntVector fromArray(VectorSpecies species, return VectorSupport.loadWithMap( vectorType, null, int.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, null, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, null, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); @@ -3366,7 +3366,7 @@ void intoArray(int[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), null, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, null, a, offset, indexMap, mapOffset, @@ -3543,8 +3543,8 @@ IntVector fromArray0Template(Class maskClass, int[] a, int offset, return VectorSupport.loadWithMap( vectorType, maskClass, int.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, m, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); @@ -3640,7 +3640,7 @@ void intoArray0Template(Class maskClass, int[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, m, a, offset, indexMap, mapOffset, diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/LongVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/LongVector.java index 21903aa6794e8..7c42bac59d49d 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/LongVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/LongVector.java @@ -2973,8 +2973,8 @@ LongVector fromArray(VectorSpecies species, return VectorSupport.loadWithMap( vectorType, null, long.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, null, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, null, a, offset, indexMap, 
mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); @@ -3264,7 +3264,7 @@ void intoArray(long[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), null, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, null, a, offset, indexMap, mapOffset, @@ -3459,8 +3459,8 @@ LongVector fromArray0Template(Class maskClass, long[] a, int offset, return VectorSupport.loadWithMap( vectorType, maskClass, long.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, m, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); @@ -3575,7 +3575,7 @@ void intoArray0Template(Class maskClass, long[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, m, a, offset, indexMap, mapOffset, diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java index 0bb97da824459..7ae07f1682161 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java @@ -3118,17 +3118,21 @@ ShortVector fromArray(VectorSpecies species, } // Check indices are within array bounds. - for (int i = 0; i < vsp.length(); i += lsp.length()) { - IntVector vix = IntVector - .fromArray(lsp, indexMap, mapOffset + i) - .add(offset); - VectorIntrinsics.checkIndex(vix, a.length); + IntVector vix0 = IntVector.fromArray(lsp, indexMap, mapOffset).add(offset); + VectorIntrinsics.checkIndex(vix0, a.length); + + int vlen = vsp.length(); + int idx_vlen = lsp.length(); + IntVector vix1 = null; + if (vlen >= idx_vlen * 2) { + vix1 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen).add(offset); + VectorIntrinsics.checkIndex(vix1, a.length); } return VectorSupport.loadWithMap( vectorType, null, short.class, vsp.laneCount(), - lsp.vectorType(), - a, ARRAY_BASE, null, null, + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, null, null, null, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); @@ -3865,17 +3869,21 @@ ShortVector fromArray0Template(Class maskClass, short[] a, int offset, // Check indices are within array bounds. // FIXME: Check index under mask controlling. 
- for (int i = 0; i < vsp.length(); i += lsp.length()) { - IntVector vix = IntVector - .fromArray(lsp, indexMap, mapOffset + i) - .add(offset); - VectorIntrinsics.checkIndex(vix, a.length); + IntVector vix0 = IntVector.fromArray(lsp, indexMap, mapOffset).add(offset); + VectorIntrinsics.checkIndex(vix0, a.length); + + int vlen = vsp.length(); + int idx_vlen = lsp.length(); + IntVector vix1 = null; + if (vlen >= idx_vlen * 2) { + vix1 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen).add(offset); + VectorIntrinsics.checkIndex(vix1, a.length); } return VectorSupport.loadWithMap( vectorType, maskClass, short.class, vsp.laneCount(), - lsp.vectorType(), - a, ARRAY_BASE, null, m, + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, null, null, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template index 8084cc307e867..5113738a23261 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template @@ -3724,20 +3724,43 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { } // Check indices are within array bounds. - for (int i = 0; i < vsp.length(); i += lsp.length()) { - IntVector vix = IntVector - .fromArray(lsp, indexMap, mapOffset + i) - .add(offset); - VectorIntrinsics.checkIndex(vix, a.length); + IntVector vix0 = IntVector.fromArray(lsp, indexMap, mapOffset).add(offset); + VectorIntrinsics.checkIndex(vix0, a.length); + + int vlen = vsp.length(); + int idx_vlen = lsp.length(); + IntVector vix1 = null; + if (vlen >= idx_vlen * 2) { + vix1 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen).add(offset); + VectorIntrinsics.checkIndex(vix1, a.length); + } + +#if[byte] + IntVector vix2 = null; + IntVector vix3 = null; + if (vlen == idx_vlen * 4) { + vix2 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 2).add(offset); + VectorIntrinsics.checkIndex(vix2, a.length); + vix3 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 3).add(offset); + VectorIntrinsics.checkIndex(vix3, a.length); } return VectorSupport.loadWithMap( vectorType, null, $type$.class, vsp.laneCount(), - lsp.vectorType(), - a, ARRAY_BASE, null, null, + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, vix2, vix3, null, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); +#else[byte] + return VectorSupport.loadWithMap( + vectorType, null, $type$.class, vsp.laneCount(), + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, null, null, null, + a, offset, indexMap, mapOffset, vsp, + (c, idx, iMap, idy, s, vm) -> + s.vOp(n -> c[idx + iMap[idy+n]])); +#end[byte] } #else[byteOrShort] @ForceInline @@ -3785,8 +3808,8 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { return VectorSupport.loadWithMap( vectorType, null, $type$.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, null, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, null, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); @@ -4411,7 +4434,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { VectorSupport.storeWithMap( vsp.vectorType(), null, 
vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, null, a, offset, indexMap, mapOffset, @@ -4932,20 +4955,43 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { // Check indices are within array bounds. // FIXME: Check index under mask controlling. - for (int i = 0; i < vsp.length(); i += lsp.length()) { - IntVector vix = IntVector - .fromArray(lsp, indexMap, mapOffset + i) - .add(offset); - VectorIntrinsics.checkIndex(vix, a.length); + IntVector vix0 = IntVector.fromArray(lsp, indexMap, mapOffset).add(offset); + VectorIntrinsics.checkIndex(vix0, a.length); + + int vlen = vsp.length(); + int idx_vlen = lsp.length(); + IntVector vix1 = null; + if (vlen >= idx_vlen * 2) { + vix1 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen).add(offset); + VectorIntrinsics.checkIndex(vix1, a.length); + } + +#if[byte] + IntVector vix2 = null; + IntVector vix3 = null; + if (vlen == idx_vlen * 4) { + vix2 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 2).add(offset); + VectorIntrinsics.checkIndex(vix2, a.length); + vix3 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 3).add(offset); + VectorIntrinsics.checkIndex(vix3, a.length); } return VectorSupport.loadWithMap( vectorType, maskClass, $type$.class, vsp.laneCount(), - lsp.vectorType(), - a, ARRAY_BASE, null, m, + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, vix2, vix3, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); +#else[byte] + return VectorSupport.loadWithMap( + vectorType, maskClass, $type$.class, vsp.laneCount(), + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, null, null, m, + a, offset, indexMap, mapOffset, vsp, + (c, idx, iMap, idy, s, vm) -> + s.vOp(vm, n -> c[idx + iMap[idy+n]])); +#end[byte] } #else[byteOrShort] @ForceInline @@ -4995,8 +5041,8 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { return VectorSupport.loadWithMap( vectorType, maskClass, $type$.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, m, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); @@ -5186,7 +5232,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { VectorSupport.storeWithMap( vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, m, a, offset, indexMap, mapOffset, diff --git a/test/hotspot/gtest/aarch64/aarch64-asmtest.py b/test/hotspot/gtest/aarch64/aarch64-asmtest.py index 92868e783dcfe..64a209c36b00e 100644 --- a/test/hotspot/gtest/aarch64/aarch64-asmtest.py +++ b/test/hotspot/gtest/aarch64/aarch64-asmtest.py @@ -2063,6 +2063,8 @@ def generate(kind, names): ["index", "__ sve_index(z7, __ D, r5, 5);", "index\tz7.d, x5, #5"], ["cpy", "__ sve_cpy(z7, __ H, p3, r5);", "cpy\tz7.h, p3/m, w5"], ["tbl", "__ sve_tbl(z16, __ S, z17, z18);", "tbl\tz16.s, {z17.s}, z18.s"], + ["ld1b", "__ sve_ld1b_gather(z15, p0, r5, z16);", "ld1b\t{z15.s}, p0/z, [x5, z16.s, uxtw]"], + ["ld1h", "__ sve_ld1h_gather(z15, p0, r5, z16);", "ld1h\t{z15.s}, p0/z, [x5, z16.s, uxtw #1]"], ["ld1w", "__ sve_ld1w_gather(z15, p0, r5, z16);", "ld1w\t{z15.s}, p0/z, [x5, z16.s, uxtw #2]"], ["ld1d", "__ sve_ld1d_gather(z15, p0, r5, z16);", "ld1d\t{z15.d}, p0/z, [x5, z16.d, uxtw 
#3]"], ["st1w", "__ sve_st1w_scatter(z15, p0, r5, z16);", "st1w\t{z15.s}, p0, [x5, z16.s, uxtw #2]"], diff --git a/test/hotspot/gtest/aarch64/asmtest.out.h b/test/hotspot/gtest/aarch64/asmtest.out.h index 0c2011592b6f4..562d8d9adb7f1 100644 --- a/test/hotspot/gtest/aarch64/asmtest.out.h +++ b/test/hotspot/gtest/aarch64/asmtest.out.h @@ -1049,6 +1049,8 @@ __ sve_index(z7, __ D, r5, 5); // index z7.d, x5, #5 __ sve_cpy(z7, __ H, p3, r5); // cpy z7.h, p3/m, w5 __ sve_tbl(z16, __ S, z17, z18); // tbl z16.s, {z17.s}, z18.s + __ sve_ld1b_gather(z15, p0, r5, z16); // ld1b {z15.s}, p0/z, [x5, z16.s, uxtw] + __ sve_ld1h_gather(z15, p0, r5, z16); // ld1h {z15.s}, p0/z, [x5, z16.s, uxtw #1] __ sve_ld1w_gather(z15, p0, r5, z16); // ld1w {z15.s}, p0/z, [x5, z16.s, uxtw #2] __ sve_ld1d_gather(z15, p0, r5, z16); // ld1d {z15.d}, p0/z, [x5, z16.d, uxtw #3] __ sve_st1w_scatter(z15, p0, r5, z16); // st1w {z15.s}, p0, [x5, z16.s, uxtw #2] @@ -1387,30 +1389,30 @@ 0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061, 0x120cb166, 0x321764bc, 0x52174681, 0x720c0227, 0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01, - 0x14000000, 0x17ffffd7, 0x1400047d, 0x94000000, - 0x97ffffd4, 0x9400047a, 0x3400000a, 0x34fffa2a, - 0x34008eea, 0x35000008, 0x35fff9c8, 0x35008e88, - 0xb400000b, 0xb4fff96b, 0xb4008e2b, 0xb500001d, - 0xb5fff91d, 0xb5008ddd, 0x10000013, 0x10fff8b3, - 0x10008d73, 0x90000013, 0x36300016, 0x3637f836, - 0x36308cf6, 0x3758000c, 0x375ff7cc, 0x37588c8c, + 0x14000000, 0x17ffffd7, 0x1400047f, 0x94000000, + 0x97ffffd4, 0x9400047c, 0x3400000a, 0x34fffa2a, + 0x34008f2a, 0x35000008, 0x35fff9c8, 0x35008ec8, + 0xb400000b, 0xb4fff96b, 0xb4008e6b, 0xb500001d, + 0xb5fff91d, 0xb5008e1d, 0x10000013, 0x10fff8b3, + 0x10008db3, 0x90000013, 0x36300016, 0x3637f836, + 0x36308d36, 0x3758000c, 0x375ff7cc, 0x37588ccc, 0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc, 0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f, 0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016, 0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0, - 0x54008a60, 0x54000001, 0x54fff541, 0x54008a01, - 0x54000002, 0x54fff4e2, 0x540089a2, 0x54000002, - 0x54fff482, 0x54008942, 0x54000003, 0x54fff423, - 0x540088e3, 0x54000003, 0x54fff3c3, 0x54008883, - 0x54000004, 0x54fff364, 0x54008824, 0x54000005, - 0x54fff305, 0x540087c5, 0x54000006, 0x54fff2a6, - 0x54008766, 0x54000007, 0x54fff247, 0x54008707, - 0x54000008, 0x54fff1e8, 0x540086a8, 0x54000009, - 0x54fff189, 0x54008649, 0x5400000a, 0x54fff12a, - 0x540085ea, 0x5400000b, 0x54fff0cb, 0x5400858b, - 0x5400000c, 0x54fff06c, 0x5400852c, 0x5400000d, - 0x54fff00d, 0x540084cd, 0x5400000e, 0x54ffefae, - 0x5400846e, 0x5400000f, 0x54ffef4f, 0x5400840f, + 0x54008aa0, 0x54000001, 0x54fff541, 0x54008a41, + 0x54000002, 0x54fff4e2, 0x540089e2, 0x54000002, + 0x54fff482, 0x54008982, 0x54000003, 0x54fff423, + 0x54008923, 0x54000003, 0x54fff3c3, 0x540088c3, + 0x54000004, 0x54fff364, 0x54008864, 0x54000005, + 0x54fff305, 0x54008805, 0x54000006, 0x54fff2a6, + 0x540087a6, 0x54000007, 0x54fff247, 0x54008747, + 0x54000008, 0x54fff1e8, 0x540086e8, 0x54000009, + 0x54fff189, 0x54008689, 0x5400000a, 0x54fff12a, + 0x5400862a, 0x5400000b, 0x54fff0cb, 0x540085cb, + 0x5400000c, 0x54fff06c, 0x5400856c, 0x5400000d, + 0x54fff00d, 0x5400850d, 0x5400000e, 0x54ffefae, + 0x540084ae, 0x5400000f, 0x54ffef4f, 0x5400844f, 0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60, 0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f, 0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf, @@ -1604,76 +1606,77 @@ 0x659ca509, 0x65d8a801, 0x65dcac01, 0x655cb241, 0x0520a1e0, 0x0521a601, 0x052281e0, 
0x05238601, 0x04a14026, 0x042244a6, 0x046344a6, 0x04a444a6, - 0x04e544a7, 0x0568aca7, 0x05b23230, 0x853040af, - 0xc5b040af, 0xe57080af, 0xe5b080af, 0x25034440, - 0x254054c4, 0x25034640, 0x25415a05, 0x25834440, - 0x25c54489, 0x250b5d3a, 0x2550dc20, 0x2518e3e1, - 0x2518e021, 0x2518e0a1, 0x2518e121, 0x2518e1a1, - 0x2558e3e2, 0x2558e042, 0x2558e0c2, 0x2558e142, - 0x2598e3e3, 0x2598e063, 0x2598e0e3, 0x2598e163, - 0x25d8e3e4, 0x25d8e084, 0x25d8e104, 0x25d8e184, - 0x2518e407, 0x05214800, 0x05614800, 0x05a14800, - 0x05e14800, 0x05214c00, 0x05614c00, 0x05a14c00, - 0x05e14c00, 0x05304001, 0x05314001, 0x05a18610, - 0x05e18610, 0x05271e11, 0x6545e891, 0x6585e891, - 0x65c5e891, 0x6545c891, 0x6585c891, 0x65c5c891, - 0x45b0c210, 0x45f1c231, 0x1e601000, 0x1e603000, - 0x1e621000, 0x1e623000, 0x1e641000, 0x1e643000, - 0x1e661000, 0x1e663000, 0x1e681000, 0x1e683000, - 0x1e6a1000, 0x1e6a3000, 0x1e6c1000, 0x1e6c3000, - 0x1e6e1000, 0x1e6e3000, 0x1e701000, 0x1e703000, - 0x1e721000, 0x1e723000, 0x1e741000, 0x1e743000, - 0x1e761000, 0x1e763000, 0x1e781000, 0x1e783000, - 0x1e7a1000, 0x1e7a3000, 0x1e7c1000, 0x1e7c3000, - 0x1e7e1000, 0x1e7e3000, 0xf82081f1, 0xf824011a, - 0xf83c1376, 0xf83b22f9, 0xf82030c4, 0xf8305080, - 0xf82f4141, 0xf8277145, 0xf83c6287, 0xf8b780d5, - 0xf8ab0228, 0xf8bf1226, 0xf8a223cc, 0xf8bd3363, - 0xf8b651dd, 0xf8ad423c, 0xf8b87045, 0xf8ae620a, - 0xf8eb82fb, 0xf8ec02c4, 0xf8f11024, 0xf8f321f0, - 0xf8ed318e, 0xf8e25071, 0xf8f540b7, 0xf8e67267, - 0xf8ed623c, 0xf8708046, 0xf87d0083, 0xf8661290, - 0xf86d228c, 0xf8683299, 0xf8735160, 0xf8784286, - 0xf87f720e, 0xf86660e0, 0xb82f8353, 0xb82902ea, - 0xb8351396, 0xb82221e3, 0xb83330f4, 0xb82450fd, - 0xb8204209, 0xb8347097, 0xb83062ea, 0xb8ab80d9, - 0xb8bf01b0, 0xb8b7102c, 0xb8ae22a9, 0xb8b031fa, - 0xb8a451e4, 0xb8a843c6, 0xb8a4723d, 0xb8bd613a, - 0xb8ef8162, 0xb8fd00e3, 0xb8e112bb, 0xb8f0210e, - 0xb8f03336, 0xb8e552b4, 0xb8f04217, 0xb8fe7294, - 0xb8e06264, 0xb8788284, 0xb8640358, 0xb8731102, - 0xb868230e, 0xb87032df, 0xb864503f, 0xb86a4194, - 0xb86070e9, 0xb8786090, 0xce2a6cdb, 0xce107db8, - 0xce748ed6, 0xce8973bf, 0xce7480f4, 0xce6b853c, - 0xcec0818e, 0xce788834, 0x25a0cd89, 0x25a1d093, - 0x05803685, 0x05400c08, 0x050074c4, 0x2560d6a0, - 0x2521c0fb, 0x05805089, 0x05403e98, 0x05025238, - 0x25e0cd0b, 0x25e1d1d2, 0x05800e4e, 0x05402676, - 0x05001e63, 0x25a0d1c9, 0x2521c495, 0x0583abe2, - 0x054011ab, 0x05007cbe, 0x2560c3b7, 0x25e1c358, - 0x05806593, 0x054064b5, 0x05000e5a, 0x2520c3f1, - 0x25a1cc29, 0x05801468, 0x05401d71, 0x05035bb2, - 0x04bb01f0, 0x046806dc, 0x659c0385, 0x65d909e0, - 0x65c30415, 0x04fa10ba, 0x04611a33, 0x042e17ce, - 0x04bf1c52, 0x0456b7d7, 0x04400008, 0x049a1417, - 0x04509b1a, 0x041b1456, 0x0499b58b, 0x04dab938, - 0x04991691, 0x04d395a4, 0x04d19ff6, 0x045011f2, - 0x0417be8d, 0x041eadc1, 0x04980987, 0x052799e4, - 0x05a49c23, 0x04c817e5, 0x044a0d2d, 0x04c901fe, - 0x044b0343, 0x04c10839, 0x04dcac2a, 0x65c087ba, - 0x658d8791, 0x65869d61, 0x65c78021, 0x65828c5b, - 0x049db33e, 0x65c2b862, 0x65c0ac7d, 0x65c1b38e, - 0x65cdab64, 0x65c19022, 0x65fc97e7, 0x65bd162a, - 0x65b82596, 0x65a0a969, 0x65a4d697, 0x65feec8f, - 0x65ba46bb, 0x65a4633f, 0x04c742a6, 0x049f7f18, - 0x042c3141, 0x04b9310d, 0x047733e1, 0x04f53014, - 0x05bb6bbf, 0x05ba6fa8, 0x65c88645, 0x4555b34d, - 0x45cab660, 0x043138c7, 0x44589b94, 0x445a8e71, - 0x44198b1a, 0x449b8f8b, 0x049a3797, 0x04183f14, - 0x045926fb, 0x04c825ac, 0x040a369a, 0x65873fa2, - 0x6586347d, 0x65982b85, 0x04412dd1, 0x0e2c116a, - 0x4e2a1128, 0x0e6b1149, 0x4e751293, 0x0ea21020, - 0x4ebf13dd, 0x2e321230, 0x6e321230, 
0x2e6f11cd, - 0x6e791317, 0x2eba1338, 0x6eb91317, + 0x04e544a7, 0x0568aca7, 0x05b23230, 0x841040af, + 0x84b040af, 0x853040af, 0xc5b040af, 0xe57080af, + 0xe5b080af, 0x25034440, 0x254054c4, 0x25034640, + 0x25415a05, 0x25834440, 0x25c54489, 0x250b5d3a, + 0x2550dc20, 0x2518e3e1, 0x2518e021, 0x2518e0a1, + 0x2518e121, 0x2518e1a1, 0x2558e3e2, 0x2558e042, + 0x2558e0c2, 0x2558e142, 0x2598e3e3, 0x2598e063, + 0x2598e0e3, 0x2598e163, 0x25d8e3e4, 0x25d8e084, + 0x25d8e104, 0x25d8e184, 0x2518e407, 0x05214800, + 0x05614800, 0x05a14800, 0x05e14800, 0x05214c00, + 0x05614c00, 0x05a14c00, 0x05e14c00, 0x05304001, + 0x05314001, 0x05a18610, 0x05e18610, 0x05271e11, + 0x6545e891, 0x6585e891, 0x65c5e891, 0x6545c891, + 0x6585c891, 0x65c5c891, 0x45b0c210, 0x45f1c231, + 0x1e601000, 0x1e603000, 0x1e621000, 0x1e623000, + 0x1e641000, 0x1e643000, 0x1e661000, 0x1e663000, + 0x1e681000, 0x1e683000, 0x1e6a1000, 0x1e6a3000, + 0x1e6c1000, 0x1e6c3000, 0x1e6e1000, 0x1e6e3000, + 0x1e701000, 0x1e703000, 0x1e721000, 0x1e723000, + 0x1e741000, 0x1e743000, 0x1e761000, 0x1e763000, + 0x1e781000, 0x1e783000, 0x1e7a1000, 0x1e7a3000, + 0x1e7c1000, 0x1e7c3000, 0x1e7e1000, 0x1e7e3000, + 0xf82081f1, 0xf824011a, 0xf83c1376, 0xf83b22f9, + 0xf82030c4, 0xf8305080, 0xf82f4141, 0xf8277145, + 0xf83c6287, 0xf8b780d5, 0xf8ab0228, 0xf8bf1226, + 0xf8a223cc, 0xf8bd3363, 0xf8b651dd, 0xf8ad423c, + 0xf8b87045, 0xf8ae620a, 0xf8eb82fb, 0xf8ec02c4, + 0xf8f11024, 0xf8f321f0, 0xf8ed318e, 0xf8e25071, + 0xf8f540b7, 0xf8e67267, 0xf8ed623c, 0xf8708046, + 0xf87d0083, 0xf8661290, 0xf86d228c, 0xf8683299, + 0xf8735160, 0xf8784286, 0xf87f720e, 0xf86660e0, + 0xb82f8353, 0xb82902ea, 0xb8351396, 0xb82221e3, + 0xb83330f4, 0xb82450fd, 0xb8204209, 0xb8347097, + 0xb83062ea, 0xb8ab80d9, 0xb8bf01b0, 0xb8b7102c, + 0xb8ae22a9, 0xb8b031fa, 0xb8a451e4, 0xb8a843c6, + 0xb8a4723d, 0xb8bd613a, 0xb8ef8162, 0xb8fd00e3, + 0xb8e112bb, 0xb8f0210e, 0xb8f03336, 0xb8e552b4, + 0xb8f04217, 0xb8fe7294, 0xb8e06264, 0xb8788284, + 0xb8640358, 0xb8731102, 0xb868230e, 0xb87032df, + 0xb864503f, 0xb86a4194, 0xb86070e9, 0xb8786090, + 0xce2a6cdb, 0xce107db8, 0xce748ed6, 0xce8973bf, + 0xce7480f4, 0xce6b853c, 0xcec0818e, 0xce788834, + 0x25a0cd89, 0x25a1d093, 0x05803685, 0x05400c08, + 0x050074c4, 0x2560d6a0, 0x2521c0fb, 0x05805089, + 0x05403e98, 0x05025238, 0x25e0cd0b, 0x25e1d1d2, + 0x05800e4e, 0x05402676, 0x05001e63, 0x25a0d1c9, + 0x2521c495, 0x0583abe2, 0x054011ab, 0x05007cbe, + 0x2560c3b7, 0x25e1c358, 0x05806593, 0x054064b5, + 0x05000e5a, 0x2520c3f1, 0x25a1cc29, 0x05801468, + 0x05401d71, 0x05035bb2, 0x04bb01f0, 0x046806dc, + 0x659c0385, 0x65d909e0, 0x65c30415, 0x04fa10ba, + 0x04611a33, 0x042e17ce, 0x04bf1c52, 0x0456b7d7, + 0x04400008, 0x049a1417, 0x04509b1a, 0x041b1456, + 0x0499b58b, 0x04dab938, 0x04991691, 0x04d395a4, + 0x04d19ff6, 0x045011f2, 0x0417be8d, 0x041eadc1, + 0x04980987, 0x052799e4, 0x05a49c23, 0x04c817e5, + 0x044a0d2d, 0x04c901fe, 0x044b0343, 0x04c10839, + 0x04dcac2a, 0x65c087ba, 0x658d8791, 0x65869d61, + 0x65c78021, 0x65828c5b, 0x049db33e, 0x65c2b862, + 0x65c0ac7d, 0x65c1b38e, 0x65cdab64, 0x65c19022, + 0x65fc97e7, 0x65bd162a, 0x65b82596, 0x65a0a969, + 0x65a4d697, 0x65feec8f, 0x65ba46bb, 0x65a4633f, + 0x04c742a6, 0x049f7f18, 0x042c3141, 0x04b9310d, + 0x047733e1, 0x04f53014, 0x05bb6bbf, 0x05ba6fa8, + 0x65c88645, 0x4555b34d, 0x45cab660, 0x043138c7, + 0x44589b94, 0x445a8e71, 0x44198b1a, 0x449b8f8b, + 0x049a3797, 0x04183f14, 0x045926fb, 0x04c825ac, + 0x040a369a, 0x65873fa2, 0x6586347d, 0x65982b85, + 0x04412dd1, 0x0e2c116a, 0x4e2a1128, 0x0e6b1149, + 0x4e751293, 0x0ea21020, 0x4ebf13dd, 0x2e321230, + 
0x6e321230, 0x2e6f11cd, 0x6e791317, 0x2eba1338, + 0x6eb91317, }; // END Generated code -- do not edit diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorGatherSubwordTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorGatherSubwordTest.java new file mode 100644 index 0000000000000..63a35db50a5a3 --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorGatherSubwordTest.java @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.vectorapi; + +import compiler.lib.generators.*; +import compiler.lib.ir_framework.*; +import jdk.incubator.vector.*; +import jdk.test.lib.Asserts; + +/** + * @test + * @bug 8351623 + * @summary VectorAPI: Refactor subword gather load and add SVE implementation + * @key randomness + * @library /test/lib / + * @modules jdk.incubator.vector + * + * @run driver compiler.vectorapi.VectorGatherSubwordTest + */ +public class VectorGatherSubwordTest { + private static final VectorSpecies B_SPECIES = ByteVector.SPECIES_MAX; + private static final VectorSpecies S_SPECIES = ShortVector.SPECIES_MAX; + + private static int LENGTH = 128; + private static final Generators random = Generators.G; + + private static byte[] ba; + private static byte[] br; + private static short[] sa; + private static short[] sr; + private static boolean[] m; + private static int[][] indexes; + + static { + ba = new byte[LENGTH]; + br = new byte[LENGTH]; + sa = new short[LENGTH]; + sr = new short[LENGTH]; + m = new boolean[LENGTH]; + indexes = new int[2][]; + + Generator byteGen = random.uniformInts(Byte.MIN_VALUE, Byte.MAX_VALUE); + Generator shortGen = random.uniformInts(Short.MIN_VALUE, Short.MAX_VALUE); + for (int i = 0; i < LENGTH; i++) { + ba[i] = byteGen.next().byteValue(); + sa[i] = shortGen.next().shortValue(); + m[i] = i % 2 == 0; + } + + int[] nums = {B_SPECIES.length(), S_SPECIES.length()}; + for (int i = 0; i < 2; i++) { + indexes[i] = new int[nums[i]]; + random.fill(random.uniformInts(0, nums[i] - 1), indexes[i]); + } + } + + @Test + @IR(counts = { IRNode.LOAD_VECTOR_GATHER, " >0 "}, applyIfCPUFeature = {"sve", "true"}) + public void testLoadGatherByte() { + for (int i = 0; i < LENGTH; i += B_SPECIES.length()) { + ByteVector.fromArray(B_SPECIES, ba, i, indexes[0], 0) + .intoArray(br, i); + } + } + + @Check(test = "testLoadGatherByte") + public void verifyLoadGatherByte() { + for (int i = 0; i < LENGTH; i += B_SPECIES.length()) { + for (int j = 0; j < B_SPECIES.length(); j++) { + Asserts.assertEquals(ba[i + indexes[0][j]], br[i + j]); 
+ } + } + } + + @Test + @IR(counts = { IRNode.LOAD_VECTOR_GATHER, " >0 "}, applyIfCPUFeature = {"sve", "true"}) + public void testLoadGatherShort() { + for (int i = 0; i < LENGTH; i += S_SPECIES.length()) { + ShortVector.fromArray(S_SPECIES, sa, i, indexes[1], 0) + .intoArray(sr, i); + } + } + + @Check(test = "testLoadGatherShort") + public void verifyLoadGatherShort() { + for (int i = 0; i < LENGTH; i += S_SPECIES.length()) { + for (int j = 0; j < S_SPECIES.length(); j++) { + Asserts.assertEquals(sa[i + indexes[1][j]], sr[i + j]); + } + } + } + + @Test + @IR(counts = { IRNode.LOAD_VECTOR_GATHER_MASKED, " >0 "}, applyIfCPUFeature = {"sve", "true"}) + public void testLoadGatherMaskedByte() { + VectorMask mask = VectorMask.fromArray(B_SPECIES, m, 0); + for (int i = 0; i < LENGTH; i += B_SPECIES.length()) { + ByteVector.fromArray(B_SPECIES, ba, i, indexes[0], 0, mask) + .intoArray(br, i); + } + } + + @Check(test = "testLoadGatherMaskedByte") + public void verifyLoadGatherMaskedByte() { + for (int i = 0; i < LENGTH; i += B_SPECIES.length()) { + for (int j = 0; j < B_SPECIES.length(); j++) { + Asserts.assertEquals(m[j] ? ba[i + indexes[0][j]] : 0, br[i + j]); + } + } + } + + @Test + @IR(counts = { IRNode.LOAD_VECTOR_GATHER_MASKED, " >0 "}, applyIfCPUFeature = {"sve", "true"}) + public void testLoadGatherMaskedShort() { + VectorMask mask = VectorMask.fromArray(S_SPECIES, m, 0); + for (int i = 0; i < LENGTH; i += S_SPECIES.length()) { + ShortVector.fromArray(S_SPECIES, sa, i, indexes[1], 0, mask) + .intoArray(sr, i); + } + } + + @Check(test = "testLoadGatherMaskedShort") + public void verifyLoadGatherMaskedShort() { + for (int i = 0; i < LENGTH; i += S_SPECIES.length()) { + for (int j = 0; j < S_SPECIES.length(); j++) { + Asserts.assertEquals(m[j] ? sa[i + indexes[1][j]] : 0, sr[i + j]); + } + } + } + + public static void main(String[] args) { + TestFramework testFramework = new TestFramework(); + testFramework.setDefaultWarmup(5000) + .addFlags("--add-modules=jdk.incubator.vector") + .start(); + } +}