From 8934fae68ed58e0af126a53ec35c4b1a810ceec7 Mon Sep 17 00:00:00 2001
From: Xiaohong Gong
Date: Thu, 20 Feb 2025 02:28:52 +0000
Subject: [PATCH 1/4] 8350463: AArch64: Add vector rearrange support for small
 lane count vectors

The AArch64 vector rearrange implementation currently lacks support for
vector types with lane counts < 4 (see [1]). This limitation results in
significant performance gaps when running Long/Double vector benchmarks
on NVIDIA Grace (an SVE2 architecture with 128-bit vectors) compared
with other SVE and x86 platforms.

Vector rearrange operations depend on vector shuffle inputs, which
previously used a byte array as their payload. Since the minimum vector
lane count for the byte type on AArch64 is 4, this imposed the same
limitation on rearrange operations. However, the vector shuffle payload
has since been updated to use vector-specific data types (e.g., `int`
for `IntVector`; see [2]), which enables us to remove the lane count
restriction on vector rearrange operations.

This patch adds rearrange support for vector types with small lane
counts. The main changes are:
- Added AArch64 match rule support for `VectorRearrange` with smaller
  lane counts (e.g., `2D/2S`)
- Relocated the NEON implementation from the ad file to the C2 macro
  assembler file, which is better suited to the more complex code
- Reduced temporary register usage in the NEON implementation for
  short/int/float types from two registers to one

Below is the performance improvement of several Vector API JMH
benchmarks on an NVIDIA Grace CPU with NEON and SVE. Performance of the
same benchmarks with other vector types remains unchanged.

1) NEON

JMH on panama-vector:vectorIntrinsics:
```
Benchmark                     (size)  Mode   Cnt  Units   Before  After     Gain
Double128Vector.rearrange       1024  thrpt   30  ops/ms  78.060   578.859   7.42x
Double128Vector.sliceUnary      1024  thrpt   30  ops/ms  72.332  1811.664  25.05x
Double128Vector.unsliceUnary    1024  thrpt   30  ops/ms  72.256  1812.344  25.08x
Float64Vector.rearrange         1024  thrpt   30  ops/ms  77.879   558.797   7.18x
Float64Vector.sliceUnary        1024  thrpt   30  ops/ms  70.528  1981.304  28.09x
Float64Vector.unsliceUnary      1024  thrpt   30  ops/ms  71.735  1994.168  27.79x
Int64Vector.rearrange           1024  thrpt   30  ops/ms  76.374   562.106   7.36x
Int64Vector.sliceUnary          1024  thrpt   30  ops/ms  71.680  1190.127  16.60x
Int64Vector.unsliceUnary        1024  thrpt   30  ops/ms  71.895  1185.094  16.48x
Long128Vector.rearrange         1024  thrpt   30  ops/ms  78.902   579.250   7.34x
Long128Vector.sliceUnary        1024  thrpt   30  ops/ms  72.389   747.794  10.33x
Long128Vector.unsliceUnary      1024  thrpt   30  ops/ms  71.999   747.848  10.38x
```

JMH on jdk mainline:
```
Benchmark                                      (SIZE)  Mode   Cnt  Units   Before  After     Gain
SelectFromBenchmark.rearrangeFromDoubleVector    1024  thrpt   30  ops/ms  44.593  1319.977  29.63x
SelectFromBenchmark.rearrangeFromDoubleVector    2048  thrpt   30  ops/ms  22.318   660.061  29.58x
SelectFromBenchmark.rearrangeFromLongVector      1024  thrpt   30  ops/ms  45.823  1458.144  31.82x
SelectFromBenchmark.rearrangeFromLongVector      2048  thrpt   30  ops/ms  23.050   729.881  31.67x
VectorXXH3HashingBenchmark.hashingKernel         1024  thrpt   30  ops/ms  97.210  1082.884  11.14x
VectorXXH3HashingBenchmark.hashingKernel         2048  thrpt   30  ops/ms  48.642   541.341  11.13x
VectorXXH3HashingBenchmark.hashingKernel         4096  thrpt   30  ops/ms  24.285   270.419  11.14x
VectorXXH3HashingBenchmark.hashingKernel         8192  thrpt   30  ops/ms  12.421   135.115  10.88x
```

2) SVE

JMH on panama-vector:vectorIntrinsics:
```
Benchmark                     (size)  Mode   Cnt  Units   Before  After     Gain
Double128Vector.rearrange       1024  thrpt   30  ops/ms  78.396   577.744   7.37x
Double128Vector.sliceUnary      1024  thrpt   30  ops/ms  72.119  2538.261  35.19x
Double128Vector.unsliceUnary 1024 thrpt 30 ops/ms 72.992 2536.972 34.75x Float64Vector.rearrange 1024 thrpt 30 ops/ms 77.400 561.934 7.26x Float64Vector.sliceUnary 1024 thrpt 30 ops/ms 70.858 2949.076 41.61x Float64Vector.unsliceUnary 1024 thrpt 30 ops/ms 70.654 2954.273 41.81x Int64Vector.rearrange 1024 thrpt 30 ops/ms 77.851 563.969 7.24x Int64Vector.sliceUnary 1024 thrpt 30 ops/ms 67.433 1510.484 22.39x Int64Vector.unsliceUnary 1024 thrpt 30 ops/ms 66.614 1511.617 22.69x Long128Vector.rearrange 1024 thrpt 30 ops/ms 77.637 579.021 7.46x Long128Vector.sliceUnary 1024 thrpt 30 ops/ms 69.886 1274.331 18.23x Long128Vector.unsliceUnary 1024 thrpt 30 ops/ms 70.069 1273.787 18.17x ``` JMH on jdk mainline: ``` Benchmark (SIZE) Mode Cnt Units Before After Gain SelectFromBenchmark.rearrangeFromDoubleVector 1024 thrpt 30 ops/ms 44.612 1351.850 30.30x SelectFromBenchmark.rearrangeFromDoubleVector 2048 thrpt 30 ops/ms 22.315 676.314 30.31x SelectFromBenchmark.rearrangeFromLongVector 1024 thrpt 30 ops/ms 46.372 1502.036 32.39x SelectFromBenchmark.rearrangeFromLongVector 2048 thrpt 30 ops/ms 23.361 749.133 32.07x VectorXXH3HashingBenchmark.hashingKernel 1024 thrpt 30 ops/ms 97.780 1759.061 17.99x VectorXXH3HashingBenchmark.hashingKernel 2048 thrpt 30 ops/ms 48.923 879.584 17.98x VectorXXH3HashingBenchmark.hashingKernel 4096 thrpt 30 ops/ms 24.219 439.588 18.15x VectorXXH3HashingBenchmark.hashingKernel 8192 thrpt 30 ops/ms 12.416 219.603 17.69x ``` [1] https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64_vector.ad#L209 [2] https://bugs.openjdk.org/browse/JDK-8310691 --- src/hotspot/cpu/aarch64/aarch64_vector.ad | 65 ++++--------------- src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 | 65 ++++--------------- .../cpu/aarch64/c2_MacroAssembler_aarch64.cpp | 58 +++++++++++++++++ .../cpu/aarch64/c2_MacroAssembler_aarch64.hpp | 4 +- 4 files changed, 83 insertions(+), 109 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index d217fb7e94987..71701196af538 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2020, 2024, Arm Limited. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // @@ -204,12 +204,6 @@ source %{ return false; } break; - case Op_VectorLoadShuffle: - case Op_VectorRearrange: - if (vlen < 4) { - return false; - } - break; case Op_ExpandV: if (UseSVE < 2 || is_subword_type(bt)) { return false; @@ -6156,61 +6150,24 @@ instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{ // ------------------------------ Vector rearrange ----------------------------- -// Here is an example that rearranges a NEON vector with 4 ints: -// Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1] -// 1. Get the indices of V1 and store them as Vi byte[0, 1, 2, 3]. -// 2. Convert Vi byte[0, 1, 2, 3] to the indices of V2 and also store them as Vi byte[2, 3, 0, 1]. -// 3. Unsigned extend Long Vi from byte[2, 3, 0, 1] to int[2, 3, 0, 1]. -// 4. Multiply Vi int[2, 3, 0, 1] with constant int[0x04040404, 0x04040404, 0x04040404, 0x04040404] -// and get tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404]. -// 5. 
Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100] -// and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504] -// 6. Use Vm as index register, and use V1 as table register. -// Then get V2 as the result by tbl NEON instructions. -// Notes: -// Step 1 matches VectorLoadConst. -// Step 3 matches VectorLoadShuffle. -// Step 4, 5, 6 match VectorRearrange. -// For VectorRearrange short/int, the reason why such complex calculation is -// required is because NEON tbl supports bytes table only, so for short/int, we -// need to lookup 2/4 bytes as a group. For VectorRearrange long, we use bsl -// to implement rearrange. - -// Maybe move the shuffle preparation to VectorLoadShuffle -instruct rearrange_HS_neon(vReg dst, vReg src, vReg shuffle, vReg tmp1, vReg tmp2) %{ - predicate(UseSVE == 0 && - (Matcher::vector_element_basic_type(n) == T_SHORT || - (type2aelembytes(Matcher::vector_element_basic_type(n)) == 4 && - Matcher::vector_length_in_bytes(n) == 16))); +instruct rearrange_HSD_neon(vReg dst, vReg src, vReg shuffle, vReg tmp) %{ + predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) != T_BYTE); match(Set dst (VectorRearrange src shuffle)); - effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2); - format %{ "rearrange_HS_neon $dst, $src, $shuffle\t# vector (4S/8S/4I/4F). KILL $tmp1, $tmp2" %} + effect(TEMP_DEF dst, TEMP tmp); + format %{ "rearrange_HSD_neon $dst, $src, $shuffle\t# vector (4H/8H/2S/4S/2D). KILL $tmp" %} ins_encode %{ BasicType bt = Matcher::vector_element_basic_type(this); - if (bt == T_SHORT) { - uint length_in_bytes = Matcher::vector_length_in_bytes(this); - assert(length_in_bytes == 8 || length_in_bytes == 16, "must be"); - Assembler::SIMD_Arrangement size1 = length_in_bytes == 16 ? __ T16B : __ T8B; - Assembler::SIMD_Arrangement size2 = length_in_bytes == 16 ? 
__ T8H : __ T4H; - __ mov($tmp1$$FloatRegister, size1, 0x02); - __ mov($tmp2$$FloatRegister, size2, 0x0100); - __ mulv($dst$$FloatRegister, size2, $shuffle$$FloatRegister, $tmp1$$FloatRegister); - __ addv($dst$$FloatRegister, size1, $dst$$FloatRegister, $tmp2$$FloatRegister); - __ tbl($dst$$FloatRegister, size1, $src$$FloatRegister, 1, $dst$$FloatRegister); - } else { - assert(bt == T_INT || bt == T_FLOAT, "unsupported type"); - __ mov($tmp1$$FloatRegister, __ T16B, 0x04); - __ mov($tmp2$$FloatRegister, __ T4S, 0x03020100); - __ mulv($dst$$FloatRegister, __ T4S, $shuffle$$FloatRegister, $tmp1$$FloatRegister); - __ addv($dst$$FloatRegister, __ T16B, $dst$$FloatRegister, $tmp2$$FloatRegister); - __ tbl($dst$$FloatRegister, __ T16B, $src$$FloatRegister, 1, $dst$$FloatRegister); - } + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + assert(length_in_bytes == 8 || length_in_bytes == 16, "must be"); + __ neon_rearrange_hsd($dst$$FloatRegister, $src$$FloatRegister, + $shuffle$$FloatRegister, $tmp$$FloatRegister, + bt, length_in_bytes == 16); %} ins_pipe(pipe_slow); %} instruct rearrange(vReg dst, vReg src, vReg shuffle) %{ - predicate(Matcher::vector_element_basic_type(n) == T_BYTE || UseSVE > 0); + predicate(UseSVE > 0 || Matcher::vector_element_basic_type(n) == T_BYTE); match(Set dst (VectorRearrange src shuffle)); format %{ "rearrange $dst, $src, $shuffle" %} ins_encode %{ diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 422e98d9b681a..575a37608fdc1 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -1,5 +1,5 @@ // -// Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2020, 2024, Arm Limited. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // @@ -194,12 +194,6 @@ source %{ return false; } break; - case Op_VectorLoadShuffle: - case Op_VectorRearrange: - if (vlen < 4) { - return false; - } - break; case Op_ExpandV: if (UseSVE < 2 || is_subword_type(bt)) { return false; @@ -4403,61 +4397,24 @@ instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{ // ------------------------------ Vector rearrange ----------------------------- -// Here is an example that rearranges a NEON vector with 4 ints: -// Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1] -// 1. Get the indices of V1 and store them as Vi byte[0, 1, 2, 3]. -// 2. Convert Vi byte[0, 1, 2, 3] to the indices of V2 and also store them as Vi byte[2, 3, 0, 1]. -// 3. Unsigned extend Long Vi from byte[2, 3, 0, 1] to int[2, 3, 0, 1]. -// 4. Multiply Vi int[2, 3, 0, 1] with constant int[0x04040404, 0x04040404, 0x04040404, 0x04040404] -// and get tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404]. -// 5. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100] -// and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504] -// 6. Use Vm as index register, and use V1 as table register. -// Then get V2 as the result by tbl NEON instructions. -// Notes: -// Step 1 matches VectorLoadConst. -// Step 3 matches VectorLoadShuffle. -// Step 4, 5, 6 match VectorRearrange. -// For VectorRearrange short/int, the reason why such complex calculation is -// required is because NEON tbl supports bytes table only, so for short/int, we -// need to lookup 2/4 bytes as a group. 
For VectorRearrange long, we use bsl -// to implement rearrange. - -// Maybe move the shuffle preparation to VectorLoadShuffle -instruct rearrange_HS_neon(vReg dst, vReg src, vReg shuffle, vReg tmp1, vReg tmp2) %{ - predicate(UseSVE == 0 && - (Matcher::vector_element_basic_type(n) == T_SHORT || - (type2aelembytes(Matcher::vector_element_basic_type(n)) == 4 && - Matcher::vector_length_in_bytes(n) == 16))); +instruct rearrange_HSD_neon(vReg dst, vReg src, vReg shuffle, vReg tmp) %{ + predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) != T_BYTE); match(Set dst (VectorRearrange src shuffle)); - effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2); - format %{ "rearrange_HS_neon $dst, $src, $shuffle\t# vector (4S/8S/4I/4F). KILL $tmp1, $tmp2" %} + effect(TEMP_DEF dst, TEMP tmp); + format %{ "rearrange_HSD_neon $dst, $src, $shuffle\t# vector (4H/8H/2S/4S/2D). KILL $tmp" %} ins_encode %{ BasicType bt = Matcher::vector_element_basic_type(this); - if (bt == T_SHORT) { - uint length_in_bytes = Matcher::vector_length_in_bytes(this); - assert(length_in_bytes == 8 || length_in_bytes == 16, "must be"); - Assembler::SIMD_Arrangement size1 = length_in_bytes == 16 ? __ T16B : __ T8B; - Assembler::SIMD_Arrangement size2 = length_in_bytes == 16 ? __ T8H : __ T4H; - __ mov($tmp1$$FloatRegister, size1, 0x02); - __ mov($tmp2$$FloatRegister, size2, 0x0100); - __ mulv($dst$$FloatRegister, size2, $shuffle$$FloatRegister, $tmp1$$FloatRegister); - __ addv($dst$$FloatRegister, size1, $dst$$FloatRegister, $tmp2$$FloatRegister); - __ tbl($dst$$FloatRegister, size1, $src$$FloatRegister, 1, $dst$$FloatRegister); - } else { - assert(bt == T_INT || bt == T_FLOAT, "unsupported type"); - __ mov($tmp1$$FloatRegister, __ T16B, 0x04); - __ mov($tmp2$$FloatRegister, __ T4S, 0x03020100); - __ mulv($dst$$FloatRegister, __ T4S, $shuffle$$FloatRegister, $tmp1$$FloatRegister); - __ addv($dst$$FloatRegister, __ T16B, $dst$$FloatRegister, $tmp2$$FloatRegister); - __ tbl($dst$$FloatRegister, __ T16B, $src$$FloatRegister, 1, $dst$$FloatRegister); - } + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + assert(length_in_bytes == 8 || length_in_bytes == 16, "must be"); + __ neon_rearrange_hsd($dst$$FloatRegister, $src$$FloatRegister, + $shuffle$$FloatRegister, $tmp$$FloatRegister, + bt, length_in_bytes == 16); %} ins_pipe(pipe_slow); %} instruct rearrange(vReg dst, vReg src, vReg shuffle) %{ - predicate(Matcher::vector_element_basic_type(n) == T_BYTE || UseSVE > 0); + predicate(UseSVE > 0 || Matcher::vector_element_basic_type(n) == T_BYTE); match(Set dst (VectorRearrange src shuffle)); format %{ "rearrange $dst, $src, $shuffle" %} ins_encode %{ diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp index e3d197a457215..2076e28f61ac6 100644 --- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp @@ -2549,6 +2549,64 @@ void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, } } +// VectorRearrange implementation for short/int/float/long/double types with NEON +// instructions. For VectorRearrange short/int/float, we use NEON tbl instruction. +// But since it supports bytes table only, we need to lookup 2/4 bytes as a group. +// For VectorRearrange long/double, we compare the shuffle input with iota indices, +// and use bsl to implement the operation. 
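+//
+// For example, for a 2D vector: to rearrange src [a, b] with shuffle
+// [1, 0], we compare the shuffle with the iota indices [0, 1], which
+// yields a per-lane mask that is set only where shuffle[i] == i. An
+// "ext" then rotates src by 8 bytes into [b, a], and "bsl" keeps the
+// src lane wherever the mask is set, taking the rotated lane elsewhere.
+// Here no lane matches, so the result is [b, a]; the identity shuffle
+// [0, 1] would return src unchanged.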
+void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
+                                           FloatRegister shuffle, FloatRegister tmp,
+                                           BasicType bt, bool isQ) {
+  assert_different_registers(dst, src, shuffle, tmp);
+  SIMD_Arrangement size1 = isQ ? T16B : T8B;
+  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
+
+  // Here is an example that rearranges a NEON vector with 4 ints:
+  // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
+  // 1. We assume the shuffle input is Vi int[2, 3, 0, 1].
+  // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector
+  //    [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
+  //    tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
+  // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
+  //    and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504].
+  // 4. Use Vm as the index register, and use V1 as the table register.
+  //    Then get V2 as the result of the NEON tbl instruction.
+  switch (bt) {
+    case T_SHORT:
+      mov(tmp, size1, 0x02);
+      mulv(dst, size2, shuffle, tmp);
+      mov(tmp, size2, 0x0100);
+      addv(dst, size1, dst, tmp);
+      tbl(dst, size1, src, 1, dst);
+      break;
+    case T_INT:
+    case T_FLOAT:
+      mov(tmp, size1, 0x04);
+      mulv(dst, size2, shuffle, tmp);
+      mov(tmp, size2, 0x03020100);
+      addv(dst, size1, dst, tmp);
+      tbl(dst, size1, src, 1, dst);
+      break;
+    case T_LONG:
+    case T_DOUBLE:
+      // Load the iota indices for the long type. The indices are ordered
+      // by type B/S/I/L/F/D, and the offset between two types is 16;
+      // hence the offset for L is 48.
+      lea(rscratch1,
+          ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
+      ldrq(tmp, rscratch1);
+      // Check whether the input "shuffle" is the same as the iota
+      // indices. Return "src" if true, otherwise swap the two elements
+      // of "src".
+      cm(EQ, dst, size2, shuffle, tmp);
+      ext(tmp, size1, src, src, 8);
+      bsl(dst, size1, src, tmp);
+      break;
+    default:
+      assert(false, "unsupported element type");
+      ShouldNotReachHere();
+  }
+}
+
 // Extract a scalar element from an sve vector at position 'idx'.
 // The input elements in src are expected to be of integral type.
 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
index d61b050407d21..e0eaa0b76e6e9 100644
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* * This code is free software; you can redistribute it and/or modify it @@ -179,6 +179,8 @@ void neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ); + void neon_rearrange_hsd(FloatRegister dst, FloatRegister src, FloatRegister shuffle, + FloatRegister tmp, BasicType bt, bool isQ); // java.lang.Math::signum intrinsics void vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero, FloatRegister one, SIMD_Arrangement T); From c0ebfa435ca1a9fbf30f8e3b8566ebdf4de914fc Mon Sep 17 00:00:00 2001 From: Xiaohong Gong Date: Fri, 14 Mar 2025 09:33:57 +0000 Subject: [PATCH 2/4] Add the IR test --- .../compiler/lib/ir_framework/IRNode.java | 30 +++ .../vectorapi/VectorRearrangeTest.java | 222 ++++++++++++++++++ 2 files changed, 252 insertions(+) create mode 100644 test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 8f28294a98685..dcc9186660dea 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -329,6 +329,36 @@ public class IRNode { superWordNodes(ADD_REDUCTION_VL, "AddReductionVL"); } + public static final String REARRANGE_VB = VECTOR_PREFIX + "REARRANGE_VB" + POSTFIX; + static { + vectorNode(REARRANGE_VB, "VectorRearrange", TYPE_BYTE); + } + + public static final String REARRANGE_VS = VECTOR_PREFIX + "REARRANGE_VS" + POSTFIX; + static { + vectorNode(REARRANGE_VS, "VectorRearrange", TYPE_SHORT); + } + + public static final String REARRANGE_VI = VECTOR_PREFIX + "REARRANGE_VI" + POSTFIX; + static { + vectorNode(REARRANGE_VI, "VectorRearrange", TYPE_INT); + } + + public static final String REARRANGE_VL = VECTOR_PREFIX + "REARRANGE_VL" + POSTFIX; + static { + vectorNode(REARRANGE_VL, "VectorRearrange", TYPE_LONG); + } + + public static final String REARRANGE_VF = VECTOR_PREFIX + "REARRANGE_VF" + POSTFIX; + static { + vectorNode(REARRANGE_VF, "VectorRearrange", TYPE_FLOAT); + } + + public static final String REARRANGE_VD = VECTOR_PREFIX + "REARRANGE_VD" + POSTFIX; + static { + vectorNode(REARRANGE_VD, "VectorRearrange", TYPE_DOUBLE); + } + public static final String ADD_P_OF = COMPOSITE_PREFIX + "ADD_P_OF" + POSTFIX; static { String regex = START + "addP_" + IS_REPLACED + MID + ".*" + END; diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java new file mode 100644 index 0000000000000..97ee86544ae31 --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/** + * @test + * @bug 8350463 + * @summary AArch64: Add vector rearrange support for small lane count vectors + * @requires (os.simpleArch == "x64" & vm.cpu.features ~= ".*avx.*") | os.arch=="aarch64" + * @modules jdk.incubator.vector + * @library /test/lib / + * + * @run driver compiler.vectorapi.VectorRearrangeTest + */ + +package compiler.vectorapi; + +import jdk.incubator.vector.*; +import compiler.lib.ir_framework.*; +import java.util.Random; +import jdk.test.lib.Utils; + +public class VectorRearrangeTest { + private static final int LENGTH = 2048; + private static final Random random = Utils.getRandomInstance(); + + private static final VectorSpecies bspec128 = ByteVector.SPECIES_128; + private static final VectorSpecies sspec128 = ShortVector.SPECIES_128; + private static final VectorSpecies ispec128 = IntVector.SPECIES_128; + private static final VectorSpecies lspec128 = LongVector.SPECIES_128; + private static final VectorSpecies fspec128 = FloatVector.SPECIES_128; + private static final VectorSpecies dspec128 = DoubleVector.SPECIES_128; + private static final VectorSpecies bspec64 = ByteVector.SPECIES_64; + private static final VectorSpecies sspec64 = ShortVector.SPECIES_64; + private static final VectorSpecies ispec64 = IntVector.SPECIES_64; + private static final VectorSpecies fspec64 = FloatVector.SPECIES_64; + + private static byte[] bsrc; + private static short[] ssrc; + private static int[] isrc; + private static long[] lsrc; + private static float[] fsrc; + private static double[] dsrc; + + private static byte[] bdst; + private static short[] sdst; + private static int[] idst; + private static long[] ldst; + private static float[] fdst; + private static double[] ddst; + + private static int[][] indexes; + + static { + bsrc = new byte[LENGTH]; + ssrc = new short[LENGTH]; + isrc = new int[LENGTH]; + lsrc = new long[LENGTH]; + fsrc = new float[LENGTH]; + dsrc = new double[LENGTH]; + bdst = new byte[LENGTH]; + sdst = new short[LENGTH]; + idst = new int[LENGTH]; + ldst = new long[LENGTH]; + fdst = new float[LENGTH]; + ddst = new double[LENGTH]; + + for (int i = 0; i < LENGTH; ++i) { + bsrc[i] = (byte)random.nextInt(); + ssrc[i] = (short)random.nextInt(); + isrc[i] = random.nextInt(); + lsrc[i] = random.nextLong(); + fsrc[i] = random.nextFloat(); + dsrc[i] = random.nextDouble(); + } + + int[] nums = {2, 4, 8, 16}; + indexes = new int[4][]; + for (int i = 0; i < 4; i++) { + indexes[i] = new int[nums[i]]; + for (int j = 0; j < nums[i]; j++) { + indexes[i][j] = random.nextInt() & (nums[i] - 1); + } + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VB, IRNode.VECTOR_SIZE_8, " >0 "}) + public void rearrange_byte64() { + VectorShuffle shuffle = VectorShuffle.fromArray(bspec64, indexes[2], 0); + for (int i = 0; i < LENGTH; i += bspec64.length()) { + ByteVector.fromArray(bspec64, bsrc, i) + .rearrange(shuffle) + .intoArray(bdst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VB, IRNode.VECTOR_SIZE_16, " >0 "}) + public void rearrange_byte128() { + VectorShuffle shuffle = VectorShuffle.fromArray(bspec128, indexes[3], 0); + for (int i = 0; i < LENGTH; i += bspec128.length()) { + 
ByteVector.fromArray(bspec128, bsrc, i) + .rearrange(shuffle) + .intoArray(bdst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VS, IRNode.VECTOR_SIZE_4, " >0 "}) + public void rearrange_short64() { + VectorShuffle shuffle = VectorShuffle.fromArray(sspec64, indexes[1], 0); + for (int i = 0; i < LENGTH; i += sspec64.length()) { + ShortVector.fromArray(sspec64, ssrc, i) + .rearrange(shuffle) + .intoArray(sdst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VS, IRNode.VECTOR_SIZE_8, " >0 "}) + public void rearrange_short128() { + VectorShuffle shuffle = VectorShuffle.fromArray(sspec128, indexes[2], 0); + for (int i = 0; i < LENGTH; i += sspec128.length()) { + ShortVector.fromArray(sspec128, ssrc, i) + .rearrange(shuffle) + .intoArray(sdst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VI, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) + public void rearrange_int64() { + VectorShuffle shuffle = VectorShuffle.fromArray(ispec64, indexes[0], 0); + for (int i = 0; i < LENGTH; i += ispec64.length()) { + IntVector.fromArray(ispec64, isrc, i) + .rearrange(shuffle) + .intoArray(idst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VI, IRNode.VECTOR_SIZE_4, " >0 "}) + public void rearrange_int128() { + VectorShuffle shuffle = VectorShuffle.fromArray(ispec128, indexes[1], 0); + for (int i = 0; i < LENGTH; i += ispec128.length()) { + IntVector.fromArray(ispec128, isrc, i) + .rearrange(shuffle) + .intoArray(idst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VL, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) + public void rearrange_long128() { + VectorShuffle shuffle = VectorShuffle.fromArray(lspec128, indexes[0], 0); + for (int i = 0; i < LENGTH; i += lspec128.length()) { + LongVector.fromArray(lspec128, lsrc, i) + .rearrange(shuffle) + .intoArray(ldst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VF, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) + public void rearrange_float64() { + VectorShuffle shuffle = VectorShuffle.fromArray(fspec64, indexes[0], 0); + for (int i = 0; i < LENGTH; i += fspec64.length()) { + FloatVector.fromArray(fspec64, fsrc, i) + .rearrange(shuffle) + .intoArray(fdst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VF, IRNode.VECTOR_SIZE_4, " >0 "}) + public void rearrange_float128() { + VectorShuffle shuffle = VectorShuffle.fromArray(fspec128, indexes[1], 0); + for (int i = 0; i < LENGTH; i += fspec128.length()) { + FloatVector.fromArray(fspec128, fsrc, i) + .rearrange(shuffle) + .intoArray(fdst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VD, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) + public void rearrange_double128() { + VectorShuffle shuffle = VectorShuffle.fromArray(dspec128, indexes[0], 0); + for (int i = 0; i < LENGTH; i += dspec128.length()) { + DoubleVector.fromArray(dspec128, dsrc, i) + .rearrange(shuffle) + .intoArray(ddst, i); + } + } + + public static void main(String[] args) { + TestFramework testFramework = new TestFramework(); + testFramework.setDefaultWarmup(10000) + .addFlags("--add-modules=jdk.incubator.vector", "-XX:-TieredCompilation") + .start(); + } +} From 1cbff61fbcf4c289048abcc076be7f3a98f4ac03 Mon Sep 17 00:00:00 2001 From: Xiaohong Gong Date: Tue, 18 Mar 2025 02:31:47 +0000 Subject: [PATCH 3/4] Update IR test based on the review comment --- .../vectorapi/VectorRearrangeTest.java | 133 +++++++++++++++--- 1 file changed, 111 insertions(+), 22 deletions(-) diff --git 
a/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java index 97ee86544ae31..a2638504235ea 100644 --- a/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java @@ -25,7 +25,6 @@ * @test * @bug 8350463 * @summary AArch64: Add vector rearrange support for small lane count vectors - * @requires (os.simpleArch == "x64" & vm.cpu.features ~= ".*avx.*") | os.arch=="aarch64" * @modules jdk.incubator.vector * @library /test/lib / * @@ -34,14 +33,14 @@ package compiler.vectorapi; -import jdk.incubator.vector.*; +import compiler.lib.generators.*; import compiler.lib.ir_framework.*; -import java.util.Random; -import jdk.test.lib.Utils; +import jdk.incubator.vector.*; +import jdk.test.lib.Asserts; public class VectorRearrangeTest { private static final int LENGTH = 2048; - private static final Random random = Utils.getRandomInstance(); + private static final Generators random = Generators.G; private static final VectorSpecies bspec128 = ByteVector.SPECIES_128; private static final VectorSpecies sspec128 = ShortVector.SPECIES_128; @@ -84,27 +83,27 @@ public class VectorRearrangeTest { fdst = new float[LENGTH]; ddst = new double[LENGTH]; - for (int i = 0; i < LENGTH; ++i) { - bsrc[i] = (byte)random.nextInt(); - ssrc[i] = (short)random.nextInt(); - isrc[i] = random.nextInt(); - lsrc[i] = random.nextLong(); - fsrc[i] = random.nextFloat(); - dsrc[i] = random.nextDouble(); + Generator byteGen = random.uniformInts(Byte.MIN_VALUE, Byte.MAX_VALUE); + Generator shortGen = random.uniformInts(Short.MIN_VALUE, Short.MAX_VALUE); + for (int i = 0; i < LENGTH; i++) { + bsrc[i] = byteGen.next().byteValue(); + ssrc[i] = shortGen.next().shortValue(); } + random.fill(random.ints(), isrc); + random.fill(random.longs(), lsrc); + random.fill(random.floats(), fsrc); + random.fill(random.doubles(), dsrc); int[] nums = {2, 4, 8, 16}; indexes = new int[4][]; for (int i = 0; i < 4; i++) { indexes[i] = new int[nums[i]]; - for (int j = 0; j < nums[i]; j++) { - indexes[i][j] = random.nextInt() & (nums[i] - 1); - } + random.fill(random.uniformInts(0, nums[i] - 1), indexes[i]); } } @Test - @IR(counts = {IRNode.REARRANGE_VB, IRNode.VECTOR_SIZE_8, " >0 "}) + @IR(counts = {IRNode.REARRANGE_VB, IRNode.VECTOR_SIZE_8, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) public void rearrange_byte64() { VectorShuffle shuffle = VectorShuffle.fromArray(bspec64, indexes[2], 0); for (int i = 0; i < LENGTH; i += bspec64.length()) { @@ -114,8 +113,17 @@ public void rearrange_byte64() { } } + @Check(test = "rearrange_byte64") + public void rearrange_byte64_verify() { + for (int i = 0; i < LENGTH; i += bspec64.length()) { + for (int j = 0; j < bspec64.length(); j++) { + Asserts.assertEquals(bsrc[indexes[2][j] + i], bdst[i + j]); + } + } + } + @Test - @IR(counts = {IRNode.REARRANGE_VB, IRNode.VECTOR_SIZE_16, " >0 "}) + @IR(counts = {IRNode.REARRANGE_VB, IRNode.VECTOR_SIZE_16, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) public void rearrange_byte128() { VectorShuffle shuffle = VectorShuffle.fromArray(bspec128, indexes[3], 0); for (int i = 0; i < LENGTH; i += bspec128.length()) { @@ -125,8 +133,17 @@ public void rearrange_byte128() { } } + @Check(test = "rearrange_byte128") + public void rearrange_byte128_verify() { + for (int i = 0; i < LENGTH; i += bspec128.length()) { + for (int j = 0; j < bspec128.length(); j++) { + Asserts.assertEquals(bsrc[indexes[3][j] + i], bdst[i 
+ j]); + } + } + } + @Test - @IR(counts = {IRNode.REARRANGE_VS, IRNode.VECTOR_SIZE_4, " >0 "}) + @IR(counts = {IRNode.REARRANGE_VS, IRNode.VECTOR_SIZE_4, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) public void rearrange_short64() { VectorShuffle shuffle = VectorShuffle.fromArray(sspec64, indexes[1], 0); for (int i = 0; i < LENGTH; i += sspec64.length()) { @@ -136,8 +153,17 @@ public void rearrange_short64() { } } + @Check(test = "rearrange_short64") + public void rearrange_short64_verify() { + for (int i = 0; i < LENGTH; i += sspec64.length()) { + for (int j = 0; j < sspec64.length(); j++) { + Asserts.assertEquals(ssrc[indexes[1][j] + i], sdst[i + j]); + } + } + } + @Test - @IR(counts = {IRNode.REARRANGE_VS, IRNode.VECTOR_SIZE_8, " >0 "}) + @IR(counts = {IRNode.REARRANGE_VS, IRNode.VECTOR_SIZE_8, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) public void rearrange_short128() { VectorShuffle shuffle = VectorShuffle.fromArray(sspec128, indexes[2], 0); for (int i = 0; i < LENGTH; i += sspec128.length()) { @@ -147,6 +173,15 @@ public void rearrange_short128() { } } + @Check(test = "rearrange_short128") + public void rearrange_short128_verify() { + for (int i = 0; i < LENGTH; i += sspec128.length()) { + for (int j = 0; j < sspec128.length(); j++) { + Asserts.assertEquals(ssrc[indexes[2][j] + i], sdst[i + j]); + } + } + } + @Test @IR(counts = {IRNode.REARRANGE_VI, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) public void rearrange_int64() { @@ -158,8 +193,17 @@ public void rearrange_int64() { } } + @Check(test = "rearrange_int64") + public void rearrange_int64_verify() { + for (int i = 0; i < LENGTH; i += ispec64.length()) { + for (int j = 0; j < ispec64.length(); j++) { + Asserts.assertEquals(isrc[indexes[0][j] + i], idst[i + j]); + } + } + } + @Test - @IR(counts = {IRNode.REARRANGE_VI, IRNode.VECTOR_SIZE_4, " >0 "}) + @IR(counts = {IRNode.REARRANGE_VI, IRNode.VECTOR_SIZE_4, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) public void rearrange_int128() { VectorShuffle shuffle = VectorShuffle.fromArray(ispec128, indexes[1], 0); for (int i = 0; i < LENGTH; i += ispec128.length()) { @@ -169,6 +213,15 @@ public void rearrange_int128() { } } + @Check(test = "rearrange_int128") + public void rearrange_int128_verify() { + for (int i = 0; i < LENGTH; i += ispec128.length()) { + for (int j = 0; j < ispec128.length(); j++) { + Asserts.assertEquals(isrc[indexes[1][j] + i], idst[i + j]); + } + } + } + @Test @IR(counts = {IRNode.REARRANGE_VL, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) public void rearrange_long128() { @@ -180,6 +233,15 @@ public void rearrange_long128() { } } + @Check(test = "rearrange_long128") + public void rearrange_long128_verify() { + for (int i = 0; i < LENGTH; i += lspec128.length()) { + for (int j = 0; j < lspec128.length(); j++) { + Asserts.assertEquals(lsrc[indexes[0][j] + i], ldst[i + j]); + } + } + } + @Test @IR(counts = {IRNode.REARRANGE_VF, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) public void rearrange_float64() { @@ -191,8 +253,17 @@ public void rearrange_float64() { } } + @Check(test = "rearrange_float64") + public void rearrange_float64_verify() { + for (int i = 0; i < LENGTH; i += fspec64.length()) { + for (int j = 0; j < fspec64.length(); j++) { + Asserts.assertEquals(fsrc[indexes[0][j] + i], fdst[i + j]); + } + } + } + @Test - @IR(counts = {IRNode.REARRANGE_VF, IRNode.VECTOR_SIZE_4, " >0 "}) + @IR(counts = {IRNode.REARRANGE_VF, 
IRNode.VECTOR_SIZE_4, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) public void rearrange_float128() { VectorShuffle shuffle = VectorShuffle.fromArray(fspec128, indexes[1], 0); for (int i = 0; i < LENGTH; i += fspec128.length()) { @@ -202,6 +273,15 @@ public void rearrange_float128() { } } + @Check(test = "rearrange_float128") + public void rearrange_float128_verify() { + for (int i = 0; i < LENGTH; i += fspec128.length()) { + for (int j = 0; j < fspec128.length(); j++) { + Asserts.assertEquals(fsrc[indexes[1][j] + i], fdst[i + j]); + } + } + } + @Test @IR(counts = {IRNode.REARRANGE_VD, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) public void rearrange_double128() { @@ -213,10 +293,19 @@ public void rearrange_double128() { } } + @Check(test = "rearrange_double128") + public void rearrange_double128_verify() { + for (int i = 0; i < LENGTH; i += dspec128.length()) { + for (int j = 0; j < dspec128.length(); j++) { + Asserts.assertEquals(dsrc[indexes[0][j] + i], ddst[i + j]); + } + } + } + public static void main(String[] args) { TestFramework testFramework = new TestFramework(); testFramework.setDefaultWarmup(10000) - .addFlags("--add-modules=jdk.incubator.vector", "-XX:-TieredCompilation") + .addFlags("--add-modules=jdk.incubator.vector") .start(); } } From 5249c9ec408dd5a8341b3ccb0dacb09c6e09220d Mon Sep 17 00:00:00 2001 From: Xiaohong Gong Date: Thu, 20 Mar 2025 02:40:18 +0000 Subject: [PATCH 4/4] Use a smaller warmup and array length in IR test --- .../hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java index a2638504235ea..f2d172b888812 100644 --- a/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java @@ -39,7 +39,7 @@ import jdk.test.lib.Asserts; public class VectorRearrangeTest { - private static final int LENGTH = 2048; + private static final int LENGTH = 1024; private static final Generators random = Generators.G; private static final VectorSpecies bspec128 = ByteVector.SPECIES_128; @@ -304,7 +304,7 @@ public void rearrange_double128_verify() { public static void main(String[] args) { TestFramework testFramework = new TestFramework(); - testFramework.setDefaultWarmup(10000) + testFramework.setDefaultWarmup(5000) .addFlags("--add-modules=jdk.incubator.vector") .start(); }
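
For reference, below is a minimal Vector API sketch of the 2-lane
rearrange that this series intrinsifies on AArch64. The class name is
illustrative and not part of the patch; run with
--add-modules=jdk.incubator.vector:

```
import jdk.incubator.vector.LongVector;
import jdk.incubator.vector.VectorShuffle;

public class SmallLaneRearrange {
    public static void main(String[] args) {
        // A 2-lane long vector. Before this series, VectorRearrange was
        // rejected by the AArch64 matcher for lane counts < 4 and fell
        // back to Java code; with it, this compiles to the new NEON/SVE
        // rearrange implementation.
        LongVector v = LongVector.fromArray(LongVector.SPECIES_128,
                                            new long[] {1L, 2L}, 0);
        // Swap the two lanes: [1, 2] -> [2, 1].
        VectorShuffle<Long> swap =
                VectorShuffle.fromValues(LongVector.SPECIES_128, 1, 0);
        long[] res = new long[2];
        v.rearrange(swap).intoArray(res, 0);
        System.out.println(res[0] + " " + res[1]); // prints "2 1"
    }
}
```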