diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad
index d217fb7e94987..71701196af538 100644
--- a/src/hotspot/cpu/aarch64/aarch64_vector.ad
+++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
+// Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 // Copyright (c) 2020, 2024, Arm Limited. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
@@ -204,12 +204,6 @@ source %{
         return false;
       }
       break;
-    case Op_VectorLoadShuffle:
-    case Op_VectorRearrange:
-      if (vlen < 4) {
-        return false;
-      }
-      break;
     case Op_ExpandV:
      if (UseSVE < 2 || is_subword_type(bt)) {
        return false;
@@ -6156,61 +6150,24 @@ instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{
 
 // ------------------------------ Vector rearrange -----------------------------
 
-// Here is an example that rearranges a NEON vector with 4 ints:
-// Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
-// 1. Get the indices of V1 and store them as Vi byte[0, 1, 2, 3].
-// 2. Convert Vi byte[0, 1, 2, 3] to the indices of V2 and also store them as Vi byte[2, 3, 0, 1].
-// 3. Unsigned extend Long Vi from byte[2, 3, 0, 1] to int[2, 3, 0, 1].
-// 4. Multiply Vi int[2, 3, 0, 1] with constant int[0x04040404, 0x04040404, 0x04040404, 0x04040404]
-// and get tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
-// 5. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100]
-// and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
-// 6. Use Vm as index register, and use V1 as table register.
-// Then get V2 as the result by tbl NEON instructions.
-// Notes:
-// Step 1 matches VectorLoadConst.
-// Step 3 matches VectorLoadShuffle.
-// Step 4, 5, 6 match VectorRearrange.
-// For VectorRearrange short/int, the reason why such complex calculation is
-// required is because NEON tbl supports bytes table only, so for short/int, we
-// need to lookup 2/4 bytes as a group. For VectorRearrange long, we use bsl
-// to implement rearrange.
-
-// Maybe move the shuffle preparation to VectorLoadShuffle
-instruct rearrange_HS_neon(vReg dst, vReg src, vReg shuffle, vReg tmp1, vReg tmp2) %{
-  predicate(UseSVE == 0 &&
-            (Matcher::vector_element_basic_type(n) == T_SHORT ||
-             (type2aelembytes(Matcher::vector_element_basic_type(n)) == 4 &&
-              Matcher::vector_length_in_bytes(n) == 16)));
+instruct rearrange_HSD_neon(vReg dst, vReg src, vReg shuffle, vReg tmp) %{
+  predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) != T_BYTE);
   match(Set dst (VectorRearrange src shuffle));
-  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
-  format %{ "rearrange_HS_neon $dst, $src, $shuffle\t# vector (4S/8S/4I/4F). KILL $tmp1, $tmp2" %}
+  effect(TEMP_DEF dst, TEMP tmp);
+  format %{ "rearrange_HSD_neon $dst, $src, $shuffle\t# vector (4H/8H/2S/4S/2D). KILL $tmp" %}
   ins_encode %{
     BasicType bt = Matcher::vector_element_basic_type(this);
-    if (bt == T_SHORT) {
-      uint length_in_bytes = Matcher::vector_length_in_bytes(this);
-      assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
-      Assembler::SIMD_Arrangement size1 = length_in_bytes == 16 ? __ T16B : __ T8B;
-      Assembler::SIMD_Arrangement size2 = length_in_bytes == 16 ? __ T8H : __ T4H;
-      __ mov($tmp1$$FloatRegister, size1, 0x02);
-      __ mov($tmp2$$FloatRegister, size2, 0x0100);
-      __ mulv($dst$$FloatRegister, size2, $shuffle$$FloatRegister, $tmp1$$FloatRegister);
-      __ addv($dst$$FloatRegister, size1, $dst$$FloatRegister, $tmp2$$FloatRegister);
-      __ tbl($dst$$FloatRegister, size1, $src$$FloatRegister, 1, $dst$$FloatRegister);
-    } else {
-      assert(bt == T_INT || bt == T_FLOAT, "unsupported type");
-      __ mov($tmp1$$FloatRegister, __ T16B, 0x04);
-      __ mov($tmp2$$FloatRegister, __ T4S, 0x03020100);
-      __ mulv($dst$$FloatRegister, __ T4S, $shuffle$$FloatRegister, $tmp1$$FloatRegister);
-      __ addv($dst$$FloatRegister, __ T16B, $dst$$FloatRegister, $tmp2$$FloatRegister);
-      __ tbl($dst$$FloatRegister, __ T16B, $src$$FloatRegister, 1, $dst$$FloatRegister);
-    }
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
+    assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
+    __ neon_rearrange_hsd($dst$$FloatRegister, $src$$FloatRegister,
+                          $shuffle$$FloatRegister, $tmp$$FloatRegister,
+                          bt, length_in_bytes == 16);
   %}
   ins_pipe(pipe_slow);
 %}
 
 instruct rearrange(vReg dst, vReg src, vReg shuffle) %{
-  predicate(Matcher::vector_element_basic_type(n) == T_BYTE || UseSVE > 0);
+  predicate(UseSVE > 0 || Matcher::vector_element_basic_type(n) == T_BYTE);
   match(Set dst (VectorRearrange src shuffle));
   format %{ "rearrange $dst, $src, $shuffle" %}
   ins_encode %{
diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
index 422e98d9b681a..575a37608fdc1 100644
--- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
+++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@@ -1,5 +1,5 @@
 //
-// Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
+// Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
 // Copyright (c) 2020, 2024, Arm Limited. All rights reserved.
 // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 //
@@ -194,12 +194,6 @@ source %{
         return false;
       }
       break;
-    case Op_VectorLoadShuffle:
-    case Op_VectorRearrange:
-      if (vlen < 4) {
-        return false;
-      }
-      break;
     case Op_ExpandV:
      if (UseSVE < 2 || is_subword_type(bt)) {
        return false;
@@ -4403,61 +4397,24 @@ instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{
 
 // ------------------------------ Vector rearrange -----------------------------
 
-// Here is an example that rearranges a NEON vector with 4 ints:
-// Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
-// 1. Get the indices of V1 and store them as Vi byte[0, 1, 2, 3].
-// 2. Convert Vi byte[0, 1, 2, 3] to the indices of V2 and also store them as Vi byte[2, 3, 0, 1].
-// 3. Unsigned extend Long Vi from byte[2, 3, 0, 1] to int[2, 3, 0, 1].
-// 4. Multiply Vi int[2, 3, 0, 1] with constant int[0x04040404, 0x04040404, 0x04040404, 0x04040404]
-// and get tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
-// 5. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100]
-// and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
-// 6. Use Vm as index register, and use V1 as table register.
-// Then get V2 as the result by tbl NEON instructions.
-// Notes:
-// Step 1 matches VectorLoadConst.
-// Step 3 matches VectorLoadShuffle.
-// Step 4, 5, 6 match VectorRearrange.
-// For VectorRearrange short/int, the reason why such complex calculation is
-// required is because NEON tbl supports bytes table only, so for short/int, we
-// need to lookup 2/4 bytes as a group. For VectorRearrange long, we use bsl
-// to implement rearrange.
-
-// Maybe move the shuffle preparation to VectorLoadShuffle
-instruct rearrange_HS_neon(vReg dst, vReg src, vReg shuffle, vReg tmp1, vReg tmp2) %{
-  predicate(UseSVE == 0 &&
-            (Matcher::vector_element_basic_type(n) == T_SHORT ||
-             (type2aelembytes(Matcher::vector_element_basic_type(n)) == 4 &&
-              Matcher::vector_length_in_bytes(n) == 16)));
+instruct rearrange_HSD_neon(vReg dst, vReg src, vReg shuffle, vReg tmp) %{
+  predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) != T_BYTE);
   match(Set dst (VectorRearrange src shuffle));
-  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
-  format %{ "rearrange_HS_neon $dst, $src, $shuffle\t# vector (4S/8S/4I/4F). KILL $tmp1, $tmp2" %}
+  effect(TEMP_DEF dst, TEMP tmp);
+  format %{ "rearrange_HSD_neon $dst, $src, $shuffle\t# vector (4H/8H/2S/4S/2D). KILL $tmp" %}
   ins_encode %{
     BasicType bt = Matcher::vector_element_basic_type(this);
-    if (bt == T_SHORT) {
-      uint length_in_bytes = Matcher::vector_length_in_bytes(this);
-      assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
-      Assembler::SIMD_Arrangement size1 = length_in_bytes == 16 ? __ T16B : __ T8B;
-      Assembler::SIMD_Arrangement size2 = length_in_bytes == 16 ? __ T8H : __ T4H;
-      __ mov($tmp1$$FloatRegister, size1, 0x02);
-      __ mov($tmp2$$FloatRegister, size2, 0x0100);
-      __ mulv($dst$$FloatRegister, size2, $shuffle$$FloatRegister, $tmp1$$FloatRegister);
-      __ addv($dst$$FloatRegister, size1, $dst$$FloatRegister, $tmp2$$FloatRegister);
-      __ tbl($dst$$FloatRegister, size1, $src$$FloatRegister, 1, $dst$$FloatRegister);
-    } else {
-      assert(bt == T_INT || bt == T_FLOAT, "unsupported type");
-      __ mov($tmp1$$FloatRegister, __ T16B, 0x04);
-      __ mov($tmp2$$FloatRegister, __ T4S, 0x03020100);
-      __ mulv($dst$$FloatRegister, __ T4S, $shuffle$$FloatRegister, $tmp1$$FloatRegister);
-      __ addv($dst$$FloatRegister, __ T16B, $dst$$FloatRegister, $tmp2$$FloatRegister);
-      __ tbl($dst$$FloatRegister, __ T16B, $src$$FloatRegister, 1, $dst$$FloatRegister);
-    }
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
+    assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
+    __ neon_rearrange_hsd($dst$$FloatRegister, $src$$FloatRegister,
+                          $shuffle$$FloatRegister, $tmp$$FloatRegister,
+                          bt, length_in_bytes == 16);
   %}
   ins_pipe(pipe_slow);
 %}
 
 instruct rearrange(vReg dst, vReg src, vReg shuffle) %{
-  predicate(Matcher::vector_element_basic_type(n) == T_BYTE || UseSVE > 0);
+  predicate(UseSVE > 0 || Matcher::vector_element_basic_type(n) == T_BYTE);
   match(Set dst (VectorRearrange src shuffle));
   format %{ "rearrange $dst, $src, $shuffle" %}
   ins_encode %{
diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
index 97cd00e652279..605a05a44a731 100644
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
@@ -2545,6 +2545,64 @@ void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src,
   }
 }
 
+// VectorRearrange implementation for short/int/float/long/double types with NEON
+// instructions. For VectorRearrange short/int/float, we use the NEON tbl instruction.
+// But since it supports byte tables only, we need to look up 2/4 bytes as a group.
+// For VectorRearrange long/double, we compare the shuffle input with the iota
+// indices, and use bsl to implement the operation.
+void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
+                                           FloatRegister shuffle, FloatRegister tmp,
+                                           BasicType bt, bool isQ) {
+  assert_different_registers(dst, src, shuffle, tmp);
+  SIMD_Arrangement size1 = isQ ? T16B : T8B;
+  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
+
+  // Here is an example that rearranges a NEON vector with 4 ints:
+  // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
+  // 1. We assume the shuffle input is Vi int[2, 3, 0, 1].
+  // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector
+  //    [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
+  //    tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
+  // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
+  //    and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504].
+  // 4. Use Vm as the index register, and use V1 as the table register.
+  //    Then get V2 as the result by the NEON tbl instruction.
+  switch (bt) {
+    case T_SHORT:
+      mov(tmp, size1, 0x02);
+      mulv(dst, size2, shuffle, tmp);
+      mov(tmp, size2, 0x0100);
+      addv(dst, size1, dst, tmp);
+      tbl(dst, size1, src, 1, dst);
+      break;
+    case T_INT:
+    case T_FLOAT:
+      mov(tmp, size1, 0x04);
+      mulv(dst, size2, shuffle, tmp);
+      mov(tmp, size2, 0x03020100);
+      addv(dst, size1, dst, tmp);
+      tbl(dst, size1, src, 1, dst);
+      break;
+    case T_LONG:
+    case T_DOUBLE:
+      // Load the iota indices for the long type. The indices are ordered by
+      // type B/S/I/L/F/D, and the offset between two types is 16; hence
+      // the offset for L is 48.
+      lea(rscratch1,
+          ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
+      ldrq(tmp, rscratch1);
+      // Check whether the input "shuffle" is the same as the iota indices.
+      // Return "src" if true, otherwise swap the two elements of "src".
+      cm(EQ, dst, size2, shuffle, tmp);
+      ext(tmp, size1, src, src, 8);
+      bsl(dst, size1, src, tmp);
+      break;
+    default:
+      assert(false, "unsupported element type");
+      ShouldNotReachHere();
+  }
+}
+
 // Extract a scalar element from an sve vector at position 'idx'.
 // The input elements in src are expected to be of integral type.
 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
index d61b050407d21..e0eaa0b76e6e9 100644
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -179,6 +179,8 @@
   void neon_reverse_bytes(FloatRegister dst, FloatRegister src,
                           BasicType bt, bool isQ);
+  void neon_rearrange_hsd(FloatRegister dst, FloatRegister src, FloatRegister shuffle,
+                          FloatRegister tmp, BasicType bt, bool isQ);
   // java.lang.Math::signum intrinsics
   void vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
                           FloatRegister one, SIMD_Arrangement T);
diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
index 0a5f6c63c7696..76faf7c9d69ed 100644
--- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
+++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
@@ -334,6 +334,36 @@ public class IRNode {
         beforeMatchingNameRegex(OPAQUE_MULTIVERSIONING, "OpaqueMultiversioning");
     }
 
+    public static final String REARRANGE_VB = VECTOR_PREFIX + "REARRANGE_VB" + POSTFIX;
+    static {
+        vectorNode(REARRANGE_VB, "VectorRearrange", TYPE_BYTE);
+    }
+
+    public static final String REARRANGE_VS = VECTOR_PREFIX + "REARRANGE_VS" + POSTFIX;
+    static {
+        vectorNode(REARRANGE_VS, "VectorRearrange", TYPE_SHORT);
+    }
+
+    public static final String REARRANGE_VI = VECTOR_PREFIX + "REARRANGE_VI" + POSTFIX;
+    static {
+        vectorNode(REARRANGE_VI, "VectorRearrange", TYPE_INT);
+    }
+
+    public static final String REARRANGE_VL = VECTOR_PREFIX + "REARRANGE_VL" + POSTFIX;
+    static {
+        vectorNode(REARRANGE_VL, "VectorRearrange", TYPE_LONG);
+    }
+
+    public static final String REARRANGE_VF = VECTOR_PREFIX + "REARRANGE_VF" + POSTFIX;
+    static {
+        vectorNode(REARRANGE_VF, "VectorRearrange", TYPE_FLOAT);
+    }
+
+    public static final String REARRANGE_VD = VECTOR_PREFIX + "REARRANGE_VD" + POSTFIX;
+    static {
+        vectorNode(REARRANGE_VD, "VectorRearrange", TYPE_DOUBLE);
+    }
+
     public static final String ADD_P_OF = COMPOSITE_PREFIX + "ADD_P_OF" + POSTFIX;
     static {
         String regex = START + "addP_" + IS_REPLACED + MID + ".*" + END;
diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java
new file mode 100644
index 0000000000000..f2d172b888812
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8350463
+ * @summary AArch64: Add vector rearrange support for small lane count vectors
+ * @modules jdk.incubator.vector
+ * @library /test/lib /
+ *
+ * @run driver compiler.vectorapi.VectorRearrangeTest
+ */
+
+package compiler.vectorapi;
+
+import compiler.lib.generators.*;
+import compiler.lib.ir_framework.*;
+import jdk.incubator.vector.*;
+import jdk.test.lib.Asserts;
+
+public class VectorRearrangeTest {
+    private static final int LENGTH = 1024;
+    private static final Generators random = Generators.G;
+
+    private static final VectorSpecies<Byte> bspec128 = ByteVector.SPECIES_128;
+    private static final VectorSpecies<Short> sspec128 = ShortVector.SPECIES_128;
+    private static final VectorSpecies<Integer> ispec128 = IntVector.SPECIES_128;
+    private static final VectorSpecies<Long> lspec128 = LongVector.SPECIES_128;
+    private static final VectorSpecies<Float> fspec128 = FloatVector.SPECIES_128;
+    private static final VectorSpecies<Double> dspec128 = DoubleVector.SPECIES_128;
+    private static final VectorSpecies<Byte> bspec64 = ByteVector.SPECIES_64;
+    private static final VectorSpecies<Short> sspec64 = ShortVector.SPECIES_64;
+    private static final VectorSpecies<Integer> ispec64 = IntVector.SPECIES_64;
+    private static final VectorSpecies<Float> fspec64 = FloatVector.SPECIES_64;
+
+    private static byte[] bsrc;
+    private static short[] ssrc;
+    private static int[] isrc;
+    private static long[] lsrc;
+    private static float[] fsrc;
+    private static double[] dsrc;
+
+    private static byte[] bdst;
+    private static short[] sdst;
+    private static int[] idst;
+    private static long[] ldst;
+    private static float[] fdst;
+    private static double[] ddst;
+
+    private static int[][] indexes;
+
+    static {
+        bsrc = new byte[LENGTH];
+        ssrc = new short[LENGTH];
+        isrc = new int[LENGTH];
+        lsrc = new long[LENGTH];
+        fsrc = new float[LENGTH];
+        dsrc = new double[LENGTH];
+        bdst = new byte[LENGTH];
+        sdst = new short[LENGTH];
+        idst = new int[LENGTH];
+        ldst = new long[LENGTH];
+        fdst = new float[LENGTH];
+        ddst = new double[LENGTH];
+
+        Generator<Integer> byteGen = random.uniformInts(Byte.MIN_VALUE, Byte.MAX_VALUE);
+        Generator<Integer> shortGen = random.uniformInts(Short.MIN_VALUE, Short.MAX_VALUE);
+        for (int i = 0; i < LENGTH; i++) {
+            bsrc[i] = byteGen.next().byteValue();
+            ssrc[i] = shortGen.next().shortValue();
+        }
+        random.fill(random.ints(), isrc);
+        random.fill(random.longs(), lsrc);
+        random.fill(random.floats(), fsrc);
+        random.fill(random.doubles(), dsrc);
+
+        int[] nums = {2, 4, 8, 16};
+        indexes = new int[4][];
+        for (int i = 0; i < 4; i++) {
+            indexes[i] = new int[nums[i]];
+            random.fill(random.uniformInts(0, nums[i] - 1), indexes[i]);
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.REARRANGE_VB, IRNode.VECTOR_SIZE_8, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    public void rearrange_byte64() {
+        VectorShuffle<Byte> shuffle = VectorShuffle.fromArray(bspec64, indexes[2], 0);
+        for (int i = 0; i < LENGTH; i += bspec64.length()) {
+            ByteVector.fromArray(bspec64, bsrc, i)
+                      .rearrange(shuffle)
+                      .intoArray(bdst, i);
+        }
+    }
+
+    @Check(test = "rearrange_byte64")
+    public void rearrange_byte64_verify() {
+        for (int i = 0; i < LENGTH; i += bspec64.length()) {
+            for (int j = 0; j < bspec64.length(); j++) {
+                Asserts.assertEquals(bsrc[indexes[2][j] + i], bdst[i + j]);
+            }
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.REARRANGE_VB, IRNode.VECTOR_SIZE_16, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    public void rearrange_byte128() {
+        VectorShuffle<Byte> shuffle = VectorShuffle.fromArray(bspec128, indexes[3], 0);
+        for (int i = 0; i < LENGTH; i += bspec128.length()) {
+            ByteVector.fromArray(bspec128, bsrc, i)
+                      .rearrange(shuffle)
+                      .intoArray(bdst, i);
+        }
+    }
+
+    @Check(test = "rearrange_byte128")
+    public void rearrange_byte128_verify() {
+        for (int i = 0; i < LENGTH; i += bspec128.length()) {
+            for (int j = 0; j < bspec128.length(); j++) {
+                Asserts.assertEquals(bsrc[indexes[3][j] + i], bdst[i + j]);
+            }
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.REARRANGE_VS, IRNode.VECTOR_SIZE_4, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    public void rearrange_short64() {
+        VectorShuffle<Short> shuffle = VectorShuffle.fromArray(sspec64, indexes[1], 0);
+        for (int i = 0; i < LENGTH; i += sspec64.length()) {
+            ShortVector.fromArray(sspec64, ssrc, i)
+                       .rearrange(shuffle)
+                       .intoArray(sdst, i);
+        }
+    }
+
+    @Check(test = "rearrange_short64")
+    public void rearrange_short64_verify() {
+        for (int i = 0; i < LENGTH; i += sspec64.length()) {
+            for (int j = 0; j < sspec64.length(); j++) {
+                Asserts.assertEquals(ssrc[indexes[1][j] + i], sdst[i + j]);
+            }
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.REARRANGE_VS, IRNode.VECTOR_SIZE_8, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    public void rearrange_short128() {
+        VectorShuffle<Short> shuffle = VectorShuffle.fromArray(sspec128, indexes[2], 0);
+        for (int i = 0; i < LENGTH; i += sspec128.length()) {
+            ShortVector.fromArray(sspec128, ssrc, i)
+                       .rearrange(shuffle)
+                       .intoArray(sdst, i);
+        }
+    }
+
+    @Check(test = "rearrange_short128")
+    public void rearrange_short128_verify() {
+        for (int i = 0; i < LENGTH; i += sspec128.length()) {
+            for (int j = 0; j < sspec128.length(); j++) {
+                Asserts.assertEquals(ssrc[indexes[2][j] + i], sdst[i + j]);
+            }
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.REARRANGE_VI, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"})
+    public void rearrange_int64() {
+        VectorShuffle<Integer> shuffle = VectorShuffle.fromArray(ispec64, indexes[0], 0);
+        for (int i = 0; i < LENGTH; i += ispec64.length()) {
+            IntVector.fromArray(ispec64, isrc, i)
+                     .rearrange(shuffle)
+                     .intoArray(idst, i);
+        }
+    }
+
+    @Check(test = "rearrange_int64")
+    public void rearrange_int64_verify() {
+        for (int i = 0; i < LENGTH; i += ispec64.length()) {
+            for (int j = 0; j < ispec64.length(); j++) {
+                Asserts.assertEquals(isrc[indexes[0][j] + i], idst[i + j]);
+            }
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.REARRANGE_VI, IRNode.VECTOR_SIZE_4, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    public void rearrange_int128() {
+        VectorShuffle<Integer> shuffle = VectorShuffle.fromArray(ispec128, indexes[1], 0);
+        for (int i = 0; i < LENGTH; i += ispec128.length()) {
+            IntVector.fromArray(ispec128, isrc, i)
+                     .rearrange(shuffle)
+                     .intoArray(idst, i);
+        }
+    }
+
+    @Check(test = "rearrange_int128")
+    public void rearrange_int128_verify() {
+        for (int i = 0; i < LENGTH; i += ispec128.length()) {
+            for (int j = 0; j < ispec128.length(); j++) {
+                Asserts.assertEquals(isrc[indexes[1][j] + i], idst[i + j]);
+            }
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.REARRANGE_VL, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"})
+    public void rearrange_long128() {
+        VectorShuffle<Long> shuffle = VectorShuffle.fromArray(lspec128, indexes[0], 0);
+        for (int i = 0; i < LENGTH; i += lspec128.length()) {
+            LongVector.fromArray(lspec128, lsrc, i)
+                      .rearrange(shuffle)
+                      .intoArray(ldst, i);
+        }
+    }
+
+    @Check(test = "rearrange_long128")
+    public void rearrange_long128_verify() {
+        for (int i = 0; i < LENGTH; i += lspec128.length()) {
+            for (int j = 0; j < lspec128.length(); j++) {
+                Asserts.assertEquals(lsrc[indexes[0][j] + i], ldst[i + j]);
+            }
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.REARRANGE_VF, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"})
+    public void rearrange_float64() {
+        VectorShuffle<Float> shuffle = VectorShuffle.fromArray(fspec64, indexes[0], 0);
+        for (int i = 0; i < LENGTH; i += fspec64.length()) {
+            FloatVector.fromArray(fspec64, fsrc, i)
+                       .rearrange(shuffle)
+                       .intoArray(fdst, i);
+        }
+    }
+
+    @Check(test = "rearrange_float64")
+    public void rearrange_float64_verify() {
+        for (int i = 0; i < LENGTH; i += fspec64.length()) {
+            for (int j = 0; j < fspec64.length(); j++) {
+                Asserts.assertEquals(fsrc[indexes[0][j] + i], fdst[i + j]);
+            }
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.REARRANGE_VF, IRNode.VECTOR_SIZE_4, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    public void rearrange_float128() {
+        VectorShuffle<Float> shuffle = VectorShuffle.fromArray(fspec128, indexes[1], 0);
+        for (int i = 0; i < LENGTH; i += fspec128.length()) {
+            FloatVector.fromArray(fspec128, fsrc, i)
+                       .rearrange(shuffle)
+                       .intoArray(fdst, i);
+        }
+    }
+
+    @Check(test = "rearrange_float128")
+    public void rearrange_float128_verify() {
+        for (int i = 0; i < LENGTH; i += fspec128.length()) {
+            for (int j = 0; j < fspec128.length(); j++) {
+                Asserts.assertEquals(fsrc[indexes[1][j] + i], fdst[i + j]);
+            }
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.REARRANGE_VD, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"})
+    public void rearrange_double128() {
+        VectorShuffle<Double> shuffle = VectorShuffle.fromArray(dspec128, indexes[0], 0);
+        for (int i = 0; i < LENGTH; i += dspec128.length()) {
+            DoubleVector.fromArray(dspec128, dsrc, i)
+                        .rearrange(shuffle)
+                        .intoArray(ddst, i);
+        }
+    }
+
+    @Check(test = "rearrange_double128")
+    public void rearrange_double128_verify() {
+        for (int i = 0; i < LENGTH; i += dspec128.length()) {
+            for (int j = 0; j < dspec128.length(); j++) {
+                Asserts.assertEquals(dsrc[indexes[0][j] + i], ddst[i + j]);
+            }
+        }
+    }
+
+    public static void main(String[] args) {
+        TestFramework testFramework = new TestFramework();
+        testFramework.setDefaultWarmup(5000)
+                     .addFlags("--add-modules=jdk.incubator.vector")
+                     .start();
+    }
+}
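
For readers following the mulv/addv/tbl sequence in neon_rearrange_hsd, here is a scalar model of how the short/int/float path expands element-level shuffle indices into the byte-level indices consumed by the byte-granular NEON tbl instruction. This is an illustrative sketch only; the class and method names are made up, and it is not JDK code:

    import java.util.Arrays;

    // Hypothetical scalar model of the mulv + addv index preparation.
    public class TblIndexModel {
        // Each element index s of an esize-byte element expands to the byte
        // indices s*esize + 0 .. s*esize + (esize-1): mulv scales the index by
        // the element size, addv adds the within-element byte offsets.
        static byte[] expand(int[] shuffle, int esize) {
            byte[] tblIndex = new byte[shuffle.length * esize];
            for (int i = 0; i < shuffle.length; i++) {
                for (int k = 0; k < esize; k++) {
                    tblIndex[i * esize + k] = (byte) (shuffle[i] * esize + k);
                }
            }
            return tblIndex;
        }

        public static void main(String[] args) {
            // The 4-int example from the patch comment: shuffle [2, 3, 0, 1]
            // yields bytes [8..11, 12..15, 0..3, 4..7], i.e. the little-endian
            // int vector [0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504].
            System.out.println(Arrays.toString(expand(new int[]{2, 3, 0, 1}, 4)));
        }
    }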
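Similarly, a scalar sketch of the long/double path (again with made-up names, not JDK code): cm(EQ) builds a per-lane mask of shuffle == iota, ext produces a half-swapped copy of src, and bsl keeps the src lane where the mask is set and takes the swapped lane elsewhere. For a two-lane vector this reduces to a per-lane select:

    // Hypothetical scalar model of the cm + ext + bsl selection (2 lanes).
    public class BslRearrangeModel {
        static long[] rearrange2(long[] src, int[] shuffle) {
            long[] swapped = {src[1], src[0]};  // models ext(src, src, 8)
            long[] dst = new long[2];
            for (int i = 0; i < 2; i++) {
                // cm(EQ) mask: shuffle[i] == i; bsl selects src where the mask
                // is set, the half-swapped copy where it is clear.
                dst[i] = (shuffle[i] == i) ? src[i] : swapped[i];
            }
            return dst;
        }

        public static void main(String[] args) {
            long[] r = rearrange2(new long[]{10L, 20L}, new int[]{1, 0});
            System.out.println(r[0] + ", " + r[1]);  // prints 20, 10
        }
    }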