diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index 76e3c92ddc261..3cd88eec8506e 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2014, 2024, Red Hat, Inc. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // @@ -2385,6 +2385,20 @@ uint Matcher::vector_ideal_reg(int len) { return 0; } +// Vector ideal reg size corresponding to the specified len in bytes +uint Matcher::vector_ideal_reg_size(int len) { + assert(MaxVectorSize >= len, ""); + uint ideal_reg = vector_ideal_reg(len); + switch (ideal_reg) { + case Op_VecD: return 8; + case Op_VecX: return 16; + case Op_VecA: return MaxVectorSize; + default: + ShouldNotReachHere(); + return 0; + } +} + MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) { assert(Matcher::is_generic_vector(generic_opnd), "not generic"); switch (ideal_reg) { @@ -2631,12 +2645,13 @@ bool Matcher::pd_clone_node(Node* n, Node* m, Matcher::MStack& mstack) { // into registers? bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) { - // Loads and stores with indirect memory input (e.g., volatile loads and - // stores) do not subsume the input into complex addressing expressions. If - // the addressing expression is input to at least one such load or store, do - // not clone the addressing expression. Query needs_acquiring_load and - // needs_releasing_store as a proxy for indirect memory input, as it is not - // possible to directly query for indirect memory input at this stage. + // Loads and stores with indirect memory input (e.g., volatile loads/stores, + // and vector gather_loads/scatter_stores) do not subsume the input into + // complex addressing expressions. If the addressing expression is input + // to at least one such load or store, do not clone the addressing expression. + // Query needs_acquiring_load and needs_releasing_store as a proxy for + // indirect memory input, as it is not possible to directly query for indirect + // memory input at this stage. for (DUIterator_Fast imax, i = m->fast_outs(imax); i < imax; i++) { Node* n = m->fast_out(i); if (n->is_Load() && needs_acquiring_load(n)) { @@ -2645,6 +2660,13 @@ bool Matcher::pd_clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, if (n->is_Store() && needs_releasing_store(n)) { return false; } + + if (n->is_LoadVectorGather() || + n->is_StoreVectorScatter() || + n->is_LoadVectorGatherMasked() || + n->is_StoreVectorScatterMasked()) { + return false; + } } if (clone_base_plus_offset_address(m, mstack, address_visited)) { diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index c7a0fc5724b2c..2f7a399a7dc3d 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -168,22 +168,21 @@ source %{ case Op_MaskAll: case Op_VectorMaskGen: case Op_LoadVectorMasked: + case Op_LoadVectorGather: + case Op_LoadVectorGatherMasked: case Op_StoreVectorMasked: case Op_StoreVectorScatter: case Op_StoreVectorScatterMasked: case Op_PopulateIndex: case Op_CompressM: case Op_CompressV: + // Temporarily disable vector mask widen support for NEON, + // as we do not have the use case now. 
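+    // Only SVE lowerings are provided for VectorMaskWiden below
+    // (sve_punpklo/sve_punpkhi); a NEON lowering would need a
+    // different instruction sequence.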
+ case Op_VectorMaskWiden: if (UseSVE == 0) { return false; } break; - case Op_LoadVectorGather: - case Op_LoadVectorGatherMasked: - if (UseSVE == 0 || is_subword_type(bt)) { - return false; - } - break; case Op_MulAddVS2VI: if (length_in_bytes != 16) { return false; @@ -325,6 +324,11 @@ source %{ return false; } + // SVE always needs the vector index for gather/scatter. + bool Matcher::gather_scatter_needs_vector_index(BasicType elem_bt, int vlen) { + return true; + } + // Assert that the given node is not a variable shift. bool assert_not_var_shift(const Node* n) { assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift"); @@ -5075,6 +5079,35 @@ instruct extractD(vRegD dst, vReg src, immI idx) %{ ins_pipe(pipe_slow); %} +// ---------------------------- Vector Slice ------------------------ + +instruct vslice_neon(vReg dst, vReg src1, vReg src2, immI index) %{ + predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst (VectorSlice (Binary src1 src2) index)); + format %{ "vslice_neon $dst, $src1, $src2, $index" %} + ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + uint scale = type2aelembytes(Matcher::vector_element_basic_type(this)); + __ ext($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B, + $src1$$FloatRegister, $src2$$FloatRegister, + ((uint)$index$$constant * scale)); + %} + ins_pipe(pipe_slow); +%} + +instruct vslice_sve(vReg dst_src1, vReg src2, immI index) %{ + predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst_src1 (VectorSlice (Binary dst_src1 src2) index)); + format %{ "vslice_sve $dst_src1, $dst_src1, $src2, $index" %} + ins_encode %{ + assert(UseSVE > 0, "must be sve"); + uint scale = type2aelembytes(Matcher::vector_element_basic_type(this)); + __ sve_ext($dst_src1$$FloatRegister, $src2$$FloatRegister, + ((uint)$index$$constant * scale)); + %} + ins_pipe(pipe_slow); +%} + // ------------------------------ Vector mask load/store ----------------------- // vector load mask @@ -5738,6 +5771,32 @@ instruct vmaskcast_narrow_sve(pReg dst, pReg src, pReg ptmp) %{ ins_pipe(pipe_slow); %} +// Vector mask widen to twice size +// +// Unpack elements from the lowest or highest half of the source +// predicate and place in elements of twice their size within the +// destination predicate. 
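+// For example, widening a mask of 16 B-sized lanes [m0, m1, ..., m15]
+// to H-sized lanes yields:
+//   lo (sve_punpklo): [m0, m1, ..., m7]
+//   hi (sve_punpkhi): [m8, m9, ..., m15]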
+ +instruct vmaskwiden_lo_sve(pReg dst, pReg src) %{ + predicate(UseSVE > 0 && n->as_VectorMaskWiden()->is_lo()); + match(Set dst (VectorMaskWiden src)); + format %{ "vmaskwiden_lo_sve $dst, $src" %} + ins_encode %{ + __ sve_punpklo($dst$$PRegister, $src$$PRegister); + %} + ins_pipe(pipe_slow); +%} + +instruct vmaskwiden_hi_sve(pReg dst, pReg src) %{ + predicate(UseSVE > 0 && !n->as_VectorMaskWiden()->is_lo()); + match(Set dst (VectorMaskWiden src)); + format %{ "vmaskwiden_hi_sve $dst, $src" %} + ins_encode %{ + __ sve_punpkhi($dst$$PRegister, $src$$PRegister); + %} + ins_pipe(pipe_slow); +%} + // vector mask reinterpret instruct vmask_reinterpret_same_esize(pReg dst_src) %{ @@ -6471,6 +6530,55 @@ instruct rearrange(vReg dst, vReg src, vReg shuffle) %{ // ------------------------------ Vector Load Gather --------------------------- +instruct gather_load_subword_le128(vReg dst, indirect mem, vReg idx) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGather()->in(3)) <= 16); + match(Set dst (LoadVectorGather mem idx)); + effect(TEMP_DEF dst); + format %{ "gather_load_subword_le128 $dst, $mem, $idx\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + } + %} + ins_pipe(pipe_slow); +%} + +instruct gather_load_subword_gt128(vReg dst, indirect mem, vReg idx, vReg vtmp) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGather()->in(3)) > 16); + match(Set dst (LoadVectorGather mem idx)); + effect(TEMP_DEF dst, TEMP vtmp); + format %{ "gather_load_subword_gt128 $dst, $mem, $idx\t# vector (sve). 
KILL $vtmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_dup($vtmp$$FloatRegister, __ S, 0); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ B, $dst$$FloatRegister, $vtmp$$FloatRegister); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{ predicate(UseSVE > 0 && type2aelembytes(Matcher::vector_element_basic_type(n)) == 4); @@ -6481,7 +6589,7 @@ instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{ assert(length_in_bytes == MaxVectorSize, "invalid vector length"); __ sve_ld1w_gather($dst$$FloatRegister, ptrue, as_Register($mem$$base), $idx$$FloatRegister); - %} + %} ins_pipe(pipe_slow); %} @@ -6501,6 +6609,55 @@ instruct gather_loadD(vReg dst, indirect mem, vReg idx, vReg tmp) %{ ins_pipe(pipe_slow); %} +instruct gather_load_subword_masked_le128(vReg dst, indirect mem, vReg idx, pRegGov pg) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGatherMasked()->in(3)->in(1)) <= 16); + match(Set dst (LoadVectorGatherMasked mem (Binary idx pg))); + effect(TEMP_DEF dst); + format %{ "gather_load_subword_masked_le128 $dst, $pg, $mem, $idx\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + } + %} + ins_pipe(pipe_slow); +%} + +instruct gather_load_subword_masked_gt128(vReg dst, indirect mem, vReg idx, vReg vtmp, pRegGov pg) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGatherMasked()->in(3)->in(1)) > 16); + match(Set dst (LoadVectorGatherMasked mem (Binary idx pg))); + effect(TEMP_DEF dst, TEMP vtmp); + format %{ "gather_load_subword_masked_gt128 $dst, $pg, $mem, $idx\t# vector (sve). 
KILL $vtmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_dup($vtmp$$FloatRegister, __ S, 0); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ B, $dst$$FloatRegister, $vtmp$$FloatRegister); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + instruct gather_loadS_masked(vReg dst, indirect mem, vReg idx, pRegGov pg) %{ predicate(UseSVE > 0 && type2aelembytes(Matcher::vector_element_basic_type(n)) == 4); diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 171bc39054549..3b2317501ce14 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -158,22 +158,21 @@ source %{ case Op_MaskAll: case Op_VectorMaskGen: case Op_LoadVectorMasked: + case Op_LoadVectorGather: + case Op_LoadVectorGatherMasked: case Op_StoreVectorMasked: case Op_StoreVectorScatter: case Op_StoreVectorScatterMasked: case Op_PopulateIndex: case Op_CompressM: case Op_CompressV: + // Temporarily disable vector mask widen support for NEON, + // as we do not have the use case now. + case Op_VectorMaskWiden: if (UseSVE == 0) { return false; } break; - case Op_LoadVectorGather: - case Op_LoadVectorGatherMasked: - if (UseSVE == 0 || is_subword_type(bt)) { - return false; - } - break; case Op_MulAddVS2VI: if (length_in_bytes != 16) { return false; @@ -315,6 +314,11 @@ source %{ return false; } + // SVE always needs the vector index for gather/scatter. + bool Matcher::gather_scatter_needs_vector_index(BasicType elem_bt, int vlen) { + return true; + } + // Assert that the given node is not a variable shift. bool assert_not_var_shift(const Node* n) { assert(!n->as_ShiftV()->is_var_shift(), "illegal variable shift"); @@ -3313,6 +3317,35 @@ EXTRACT_FP(F, fmovs, 4, S, 2) // DOUBLE EXTRACT_FP(D, fmovd, 2, D, 3) +// ---------------------------- Vector Slice ------------------------ + +instruct vslice_neon(vReg dst, vReg src1, vReg src2, immI index) %{ + predicate(VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst (VectorSlice (Binary src1 src2) index)); + format %{ "vslice_neon $dst, $src1, $src2, $index" %} + ins_encode %{ + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + uint scale = type2aelembytes(Matcher::vector_element_basic_type(this)); + __ ext($dst$$FloatRegister, length_in_bytes == 16 ? 
__ T16B : __ T8B, + $src1$$FloatRegister, $src2$$FloatRegister, + ((uint)$index$$constant * scale)); + %} + ins_pipe(pipe_slow); +%} + +instruct vslice_sve(vReg dst_src1, vReg src2, immI index) %{ + predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n))); + match(Set dst_src1 (VectorSlice (Binary dst_src1 src2) index)); + format %{ "vslice_sve $dst_src1, $dst_src1, $src2, $index" %} + ins_encode %{ + assert(UseSVE > 0, "must be sve"); + uint scale = type2aelembytes(Matcher::vector_element_basic_type(this)); + __ sve_ext($dst_src1$$FloatRegister, $src2$$FloatRegister, + ((uint)$index$$constant * scale)); + %} + ins_pipe(pipe_slow); +%} + // ------------------------------ Vector mask load/store ----------------------- // vector load mask @@ -3885,6 +3918,32 @@ instruct vmaskcast_narrow_sve(pReg dst, pReg src, pReg ptmp) %{ ins_pipe(pipe_slow); %} +// Vector mask widen to twice size +// +// Unpack elements from the lowest or highest half of the source +// predicate and place in elements of twice their size within the +// destination predicate. + +instruct vmaskwiden_lo_sve(pReg dst, pReg src) %{ + predicate(UseSVE > 0 && n->as_VectorMaskWiden()->is_lo()); + match(Set dst (VectorMaskWiden src)); + format %{ "vmaskwiden_lo_sve $dst, $src" %} + ins_encode %{ + __ sve_punpklo($dst$$PRegister, $src$$PRegister); + %} + ins_pipe(pipe_slow); +%} + +instruct vmaskwiden_hi_sve(pReg dst, pReg src) %{ + predicate(UseSVE > 0 && !n->as_VectorMaskWiden()->is_lo()); + match(Set dst (VectorMaskWiden src)); + format %{ "vmaskwiden_hi_sve $dst, $src" %} + ins_encode %{ + __ sve_punpkhi($dst$$PRegister, $src$$PRegister); + %} + ins_pipe(pipe_slow); +%} + // vector mask reinterpret instruct vmask_reinterpret_same_esize(pReg dst_src) %{ @@ -4574,6 +4633,55 @@ instruct rearrange(vReg dst, vReg src, vReg shuffle) %{ // ------------------------------ Vector Load Gather --------------------------- +instruct gather_load_subword_le128(vReg dst, indirect mem, vReg idx) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGather()->in(3)) <= 16); + match(Set dst (LoadVectorGather mem idx)); + effect(TEMP_DEF dst); + format %{ "gather_load_subword_le128 $dst, $mem, $idx\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + } + %} + ins_pipe(pipe_slow); +%} + +instruct gather_load_subword_gt128(vReg dst, indirect mem, vReg idx, vReg vtmp) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGather()->in(3)) > 16); + match(Set dst (LoadVectorGather mem idx)); + effect(TEMP_DEF dst, TEMP vtmp); + format %{ "gather_load_subword_gt128 $dst, $mem, $idx\t# vector (sve). 
KILL $vtmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_dup($vtmp$$FloatRegister, __ S, 0); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ B, $dst$$FloatRegister, $vtmp$$FloatRegister); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, ptrue, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{ predicate(UseSVE > 0 && type2aelembytes(Matcher::vector_element_basic_type(n)) == 4); @@ -4584,7 +4692,7 @@ instruct gather_loadS(vReg dst, indirect mem, vReg idx) %{ assert(length_in_bytes == MaxVectorSize, "invalid vector length"); __ sve_ld1w_gather($dst$$FloatRegister, ptrue, as_Register($mem$$base), $idx$$FloatRegister); - %} + %} ins_pipe(pipe_slow); %} @@ -4604,6 +4712,55 @@ instruct gather_loadD(vReg dst, indirect mem, vReg idx, vReg tmp) %{ ins_pipe(pipe_slow); %} +instruct gather_load_subword_masked_le128(vReg dst, indirect mem, vReg idx, pRegGov pg) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGatherMasked()->in(3)->in(1)) <= 16); + match(Set dst (LoadVectorGatherMasked mem (Binary idx pg))); + effect(TEMP_DEF dst); + format %{ "gather_load_subword_masked_le128 $dst, $pg, $mem, $idx\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + __ xtn($dst$$FloatRegister, __ T8B, $dst$$FloatRegister, __ T8H); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ xtn($dst$$FloatRegister, __ T4H, $dst$$FloatRegister, __ T4S); + } + %} + ins_pipe(pipe_slow); +%} + +instruct gather_load_subword_masked_gt128(vReg dst, indirect mem, vReg idx, vReg vtmp, pRegGov pg) %{ + predicate(UseSVE > 0 && + type2aelembytes(Matcher::vector_element_basic_type(n)) <= 2 && + Matcher::vector_length_in_bytes(n->as_LoadVectorGatherMasked()->in(3)->in(1)) > 16); + match(Set dst (LoadVectorGatherMasked mem (Binary idx pg))); + effect(TEMP_DEF dst, TEMP vtmp); + format %{ "gather_load_subword_masked_gt128 $dst, $pg, $mem, $idx\t# vector (sve). 
KILL $vtmp" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + __ sve_dup($vtmp$$FloatRegister, __ S, 0); + if (bt == T_BYTE) { + __ sve_ld1b_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ B, $dst$$FloatRegister, $vtmp$$FloatRegister); + } else { + assert(bt == T_SHORT, "unsupported type"); + __ sve_ld1h_gather($dst$$FloatRegister, $pg$$PRegister, + as_Register($mem$$base), $idx$$FloatRegister); + __ sve_uzp1($dst$$FloatRegister, __ H, $dst$$FloatRegister, $vtmp$$FloatRegister); + } + %} + ins_pipe(pipe_slow); +%} + instruct gather_loadS_masked(vReg dst, indirect mem, vReg idx, pRegGov pg) %{ predicate(UseSVE > 0 && type2aelembytes(Matcher::vector_element_basic_type(n)) == 4); diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp index 3db7d30884429..8625481678c1e 100644 --- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp @@ -3616,6 +3616,10 @@ template f(op1, 31, 25), f(type, 24, 23), f(op2, 22, 21), rf(Zm, 16); \ f(op3, 15, 13), pgrf(Pg, 10), srf(Xn, 5), rf(Zt, 0); \ } + // SVE 8-bit gather load bytes (scalar plus 32-bit unscaled offsets) + INSN(sve_ld1b_gather, 0b1000010, 0b00, 0b00, 0b010); + // SVE 16-bit gather load halfwords (scalar plus 32-bit scaled offsets) + INSN(sve_ld1h_gather, 0b1000010, 0b01, 0b01, 0b010); // SVE 32-bit gather load words (scalar plus 32-bit scaled offsets) INSN(sve_ld1w_gather, 0b1000010, 0b10, 0b01, 0b010); // SVE 64-bit gather load (scalar plus 32-bit unpacked scaled offsets) diff --git a/src/hotspot/cpu/arm/arm.ad b/src/hotspot/cpu/arm/arm.ad index f3b97d23ad306..5c1bcb2e53fb7 100644 --- a/src/hotspot/cpu/arm/arm.ad +++ b/src/hotspot/cpu/arm/arm.ad @@ -1003,6 +1003,10 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen return false; } +bool Matcher::gather_scatter_needs_vector_index(BasicType elem_bt, int vlen) { + return false; +} + const RegMask* Matcher::predicate_reg_mask(void) { return nullptr; } @@ -1037,6 +1041,12 @@ uint Matcher::vector_ideal_reg(int size) { return 0; } +// Vector ideal reg size corresponding to the specified size in bytes +uint Matcher::vector_ideal_reg_size(int size) { + assert(MaxVectorSize >= size, ""); + return size; +} + // Limits on vector size (number of elements) loaded into vector. int Matcher::max_vector_size(const BasicType bt) { assert(is_java_primitive(bt), "only primitive type vectors"); diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad index 07d681e89823e..b71250ebdf80f 100644 --- a/src/hotspot/cpu/ppc/ppc.ad +++ b/src/hotspot/cpu/ppc/ppc.ad @@ -2162,6 +2162,10 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen return false; } +bool Matcher::gather_scatter_needs_vector_index(BasicType elem_bt, int vlen) { + return false; +} + const RegMask* Matcher::predicate_reg_mask(void) { return nullptr; } @@ -2198,6 +2202,12 @@ uint Matcher::vector_ideal_reg(int size) { } } +// Vector ideal reg size corresponding to the specified size in bytes +uint Matcher::vector_ideal_reg_size(int size) { + assert(MaxVectorSize == size, ""); + return size; +} + // Limits on vector size (number of elements) loaded into vector. 
int Matcher::max_vector_size(const BasicType bt) { assert(is_java_primitive(bt), "only primitive type vectors"); diff --git a/src/hotspot/cpu/riscv/riscv.ad b/src/hotspot/cpu/riscv/riscv.ad index aca2f4dd488ae..f6561fbd88970 100644 --- a/src/hotspot/cpu/riscv/riscv.ad +++ b/src/hotspot/cpu/riscv/riscv.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved. // Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. @@ -2029,6 +2029,12 @@ uint Matcher::vector_ideal_reg(int len) { return 0; } +// Vector ideal reg size corresponding to the specified len in bytes +uint Matcher::vector_ideal_reg_size(int len) { + assert(MaxVectorSize >= len, ""); + return MaxVectorSize; +} + int Matcher::scalable_vector_reg_size(const BasicType bt) { return Matcher::max_vector_size(bt); } diff --git a/src/hotspot/cpu/riscv/riscv_v.ad b/src/hotspot/cpu/riscv/riscv_v.ad index 1e99b2ece2914..c51aad8e8fecb 100644 --- a/src/hotspot/cpu/riscv/riscv_v.ad +++ b/src/hotspot/cpu/riscv/riscv_v.ad @@ -138,6 +138,10 @@ source %{ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen) { return false; } + + bool Matcher::gather_scatter_needs_vector_index(BasicType elem_bt, int vlen) { + return !is_subword_type(elem_bt); + } %} definitions %{ diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad index c32064be86d87..5771d0a77c5cb 100644 --- a/src/hotspot/cpu/s390/s390.ad +++ b/src/hotspot/cpu/s390/s390.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2017, 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2017, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2017, 2024 SAP SE. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // @@ -1809,6 +1809,10 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen return false; } +bool Matcher::gather_scatter_needs_vector_index(BasicType elem_bt, int vlen) { + return false; +} + const RegMask* Matcher::predicate_reg_mask(void) { return nullptr; } @@ -1847,6 +1851,12 @@ uint Matcher::vector_ideal_reg(int size) { } } +// Vector ideal reg size corresponding to the specified size in bytes +uint Matcher::vector_ideal_reg_size(int size) { + assert(MaxVectorSize == size, ""); + return size; +} + // Limits on vector size (number of elements) loaded into vector. 
int Matcher::max_vector_size(const BasicType bt) { assert(is_java_primitive(bt), "only primitive type vectors"); diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp index a7967d83a4e7f..fb4eab9c63755 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp @@ -1478,23 +1478,18 @@ void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, } } -void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, - XMMRegister dst, Register base, - Register idx_base, - Register offset, Register mask, - Register mask_idx, Register rtmp, - int vlen_enc) { +void C2_MacroAssembler::vgather8b_masked(BasicType elem_bt, XMMRegister dst, + Register base, Register idx_base, + Register mask, Register mask_idx, + Register rtmp, int vlen_enc) { vpxor(dst, dst, dst, vlen_enc); if (elem_bt == T_SHORT) { for (int i = 0; i < 4; i++) { - // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 + // dst[i] = mask[i] ? src[idx_base[i]] : 0 Label skip_load; btq(mask, mask_idx); jccb(Assembler::carryClear, skip_load); movl(rtmp, Address(idx_base, i * 4)); - if (offset != noreg) { - addl(rtmp, offset); - } pinsrw(dst, Address(base, rtmp, Address::times_2), i); bind(skip_load); incq(mask_idx); @@ -1502,14 +1497,11 @@ void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, } else { assert(elem_bt == T_BYTE, ""); for (int i = 0; i < 8; i++) { - // dst[i] = mask[i] ? src[offset + idx_base[i]] : 0 + // dst[i] = mask[i] ? src[idx_base[i]] : 0 Label skip_load; btq(mask, mask_idx); jccb(Assembler::carryClear, skip_load); movl(rtmp, Address(idx_base, i * 4)); - if (offset != noreg) { - addl(rtmp, offset); - } pinsrb(dst, Address(base, rtmp), i); bind(skip_load); incq(mask_idx); @@ -1517,28 +1509,21 @@ void C2_MacroAssembler::vgather8b_masked_offset(BasicType elem_bt, } } -void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, - Register base, Register idx_base, - Register offset, Register rtmp, - int vlen_enc) { +void C2_MacroAssembler::vgather8b(BasicType elem_bt, XMMRegister dst, + Register base, Register idx_base, + Register rtmp, int vlen_enc) { vpxor(dst, dst, dst, vlen_enc); if (elem_bt == T_SHORT) { for (int i = 0; i < 4; i++) { - // dst[i] = src[offset + idx_base[i]] + // dst[i] = src[idx_base[i]] movl(rtmp, Address(idx_base, i * 4)); - if (offset != noreg) { - addl(rtmp, offset); - } pinsrw(dst, Address(base, rtmp, Address::times_2), i); } } else { assert(elem_bt == T_BYTE, ""); for (int i = 0; i < 8; i++) { - // dst[i] = src[offset + idx_base[i]] + // dst[i] = src[idx_base[i]] movl(rtmp, Address(idx_base, i * 4)); - if (offset != noreg) { - addl(rtmp, offset); - } pinsrb(dst, Address(base, rtmp), i); } } @@ -1567,11 +1552,10 @@ void C2_MacroAssembler::vgather8b_offset(BasicType elem_bt, XMMRegister dst, */ void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, Register base, Register idx_base, - Register offset, Register mask, - XMMRegister xtmp1, XMMRegister xtmp2, - XMMRegister temp_dst, Register rtmp, - Register mask_idx, Register length, - int vector_len, int vlen_enc) { + Register mask, XMMRegister xtmp1, + XMMRegister xtmp2, XMMRegister temp_dst, + Register rtmp, Register mask_idx, + Register length, int vector_len, int vlen_enc) { Label GATHER8_LOOP; assert(is_subword_type(elem_ty), ""); movl(length, vector_len); @@ -1585,9 +1569,9 @@ void C2_MacroAssembler::vgather_subword(BasicType elem_ty, XMMRegister dst, bind(GATHER8_LOOP); // 
TMP_VEC_64(temp_dst) = PICK_SUB_WORDS_FROM_GATHER_INDICES if (mask == noreg) { - vgather8b_offset(elem_ty, temp_dst, base, idx_base, offset, rtmp, vlen_enc); + vgather8b(elem_ty, temp_dst, base, idx_base, rtmp, vlen_enc); } else { - vgather8b_masked_offset(elem_ty, temp_dst, base, idx_base, offset, mask, mask_idx, rtmp, vlen_enc); + vgather8b_masked(elem_ty, temp_dst, base, idx_base, mask, mask_idx, rtmp, vlen_enc); } // TEMP_PERM_VEC(temp_dst) = PERMUTE TMP_VEC_64(temp_dst) PERM_INDEX(xtmp1) vpermd(temp_dst, xtmp1, temp_dst, vlen_enc == Assembler::AVX_512bit ? vlen_enc : Assembler::AVX_256bit); diff --git a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp index dd2880d88c381..21aa899766546 100644 --- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp +++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp @@ -489,15 +489,14 @@ void efp16sh(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2); - void vgather_subword(BasicType elem_ty, XMMRegister dst, Register base, Register idx_base, Register offset, - Register mask, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, + void vgather_subword(BasicType elem_ty, XMMRegister dst, Register base, Register idx_base, Register mask, + XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, Register midx, Register length, int vector_len, int vlen_enc); - void vgather8b_masked_offset(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base, - Register offset, Register mask, Register midx, Register rtmp, int vlen_enc); - - void vgather8b_offset(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base, - Register offset, Register rtmp, int vlen_enc); + void vgather8b_masked(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base, + Register mask, Register midx, Register rtmp, int vlen_enc); + void vgather8b(BasicType elem_bt, XMMRegister dst, Register base, Register idx_base, + Register rtmp, int vlen_enc); void vector_saturating_op(int opc, BasicType elem_bt, XMMRegister dst, XMMRegister src1, XMMRegister src2, bool is_unsigned, int vlen_enc); diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index a281331cb2986..ad109af47b2c8 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -2100,6 +2100,11 @@ bool Matcher::vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen } } +// Return true if gather/scatter needs vector index as input. 
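+// Subword (byte/short) gathers on x86 are expanded as a scalar load loop
+// driven by the index array in memory (see C2_MacroAssembler::vgather_subword),
+// so no vector of indices is materialized for them.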
+bool Matcher::gather_scatter_needs_vector_index(BasicType elem_bt, int vlen) { + return !is_subword_type(elem_bt); +} + MachOper* Matcher::pd_specialize_generic_vector_operand(MachOper* generic_opnd, uint ideal_reg, bool is_temp) { assert(Matcher::is_generic_vector(generic_opnd), "not generic"); bool legacy = (generic_opnd->opcode() == LEGVEC); @@ -2247,6 +2252,12 @@ uint Matcher::vector_ideal_reg(int size) { return 0; } +// Vector ideal reg size corresponding to the specified len in bytes +uint Matcher::vector_ideal_reg_size(int size) { + assert(MaxVectorSize >= size, ""); + return size; +} + // Check for shift by small constant as well static bool clone_shift(Node* shift, Matcher* matcher, Matcher::MStack& mstack, VectorSet& address_visited) { if (shift->Opcode() == Op_LShiftX && shift->in(2)->is_Con() && @@ -4023,24 +4034,24 @@ instruct evgather_masked(vec dst, memory mem, vec idx, kReg mask, kReg ktmp, rRe ins_pipe( pipe_slow ); %} -instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegI rtmp) %{ +instruct vgather_subwordLE8B(vec dst, memory mem, rRegP idx_base, rRegP tmp, rRegI rtmp) %{ predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); - match(Set dst (LoadVectorGather mem (Binary idx_base offset))); + match(Set dst (LoadVectorGather mem idx_base)); effect(TEMP tmp, TEMP rtmp); format %{ "vector_gatherLE8 $dst, $mem, $idx_base\t! using $tmp and $rtmp as TEMP" %} ins_encode %{ int vlen_enc = vector_length_encoding(this); BasicType elem_bt = Matcher::vector_element_basic_type(this); __ lea($tmp$$Register, $mem$$Address); - __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp$$Register, vlen_enc); + __ vgather8b(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp$$Register, vlen_enc); %} ins_pipe( pipe_slow ); %} -instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, rRegP tmp, rRegP idx_base_temp, +instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, rRegP tmp, rRegP idx_base_temp, vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{ predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); - match(Set dst (LoadVectorGather mem (Binary idx_base offset))); + match(Set dst (LoadVectorGather mem idx_base)); effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr); format %{ "vector_gatherGT8 $dst, $mem, $idx_base\t! 
using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %} ins_encode %{ @@ -4049,49 +4060,15 @@ instruct vgather_subwordGT8B(vec dst, memory mem, rRegP idx_base, immI_0 offset, BasicType elem_bt = Matcher::vector_element_basic_type(this); __ lea($tmp$$Register, $mem$$Address); __ movptr($idx_base_temp$$Register, $idx_base$$Register); - __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, noreg, $xtmp1$$XMMRegister, + __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc); %} ins_pipe( pipe_slow ); %} -instruct vgather_subwordLE8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegI rtmp, rFlagsReg cr) %{ - predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); - match(Set dst (LoadVectorGather mem (Binary idx_base offset))); - effect(TEMP tmp, TEMP rtmp, KILL cr); - format %{ "vector_gatherLE8_off $dst, $mem, $idx_base, $offset\t! using $tmp and $rtmp as TEMP" %} - ins_encode %{ - int vlen_enc = vector_length_encoding(this); - BasicType elem_bt = Matcher::vector_element_basic_type(this); - __ lea($tmp$$Register, $mem$$Address); - __ vgather8b_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, $rtmp$$Register, vlen_enc); - %} - ins_pipe( pipe_slow ); -%} - - -instruct vgather_subwordGT8B_off(vec dst, memory mem, rRegP idx_base, rRegI offset, rRegP tmp, rRegP idx_base_temp, - vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI length, rFlagsReg cr) %{ - predicate(is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); - match(Set dst (LoadVectorGather mem (Binary idx_base offset))); - effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP length, KILL cr); - format %{ "vector_gatherGT8_off $dst, $mem, $idx_base, $offset\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp and $length as TEMP" %} - ins_encode %{ - int vlen_enc = vector_length_encoding(this); - int vector_len = Matcher::vector_length(this); - BasicType elem_bt = Matcher::vector_element_basic_type(this); - __ lea($tmp$$Register, $mem$$Address); - __ movptr($idx_base_temp$$Register, $idx_base$$Register); - __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, noreg, $xtmp1$$XMMRegister, - $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, noreg, $length$$Register, vector_len, vlen_enc); - %} - ins_pipe( pipe_slow ); -%} - - -instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{ +instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{ predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask))); effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr); format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! 
using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %} ins_encode %{ @@ -4100,15 +4077,15 @@ instruct vgather_masked_subwordLE8B_avx3(vec dst, memory mem, rRegP idx_base, im __ xorq($mask_idx$$Register, $mask_idx$$Register); __ lea($tmp$$Register, $mem$$Address); __ kmovql($rtmp2$$Register, $mask$$KRegister); - __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); + __ vgather8b_masked(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); %} ins_pipe( pipe_slow ); %} -instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, immI_0 offset, kReg mask, rRegP tmp, rRegP idx_base_temp, +instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, kReg mask, rRegP tmp, rRegP idx_base_temp, vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{ predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask))); effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr); format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %} ins_encode %{ @@ -4119,52 +4096,15 @@ instruct vgather_masked_subwordGT8B_avx3(vec dst, memory mem, rRegP idx_base, im __ lea($tmp$$Register, $mem$$Address); __ movptr($idx_base_temp$$Register, $idx_base$$Register); __ kmovql($rtmp2$$Register, $mask$$KRegister); - __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister, + __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc); %} ins_pipe( pipe_slow ); %} -instruct vgather_masked_subwordLE8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegL mask_idx, rRegP tmp, rRegI rtmp, rRegL rtmp2, rFlagsReg cr) %{ - predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); - effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr); - format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! 
using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %} - ins_encode %{ - int vlen_enc = vector_length_encoding(this); - BasicType elem_bt = Matcher::vector_element_basic_type(this); - __ xorq($mask_idx$$Register, $mask_idx$$Register); - __ lea($tmp$$Register, $mem$$Address); - __ kmovql($rtmp2$$Register, $mask$$KRegister); - __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, - $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); - %} - ins_pipe( pipe_slow ); -%} - -instruct vgather_masked_subwordGT8B_off_avx3(vec dst, memory mem, rRegP idx_base, rRegI offset, kReg mask, rRegP tmp, rRegP idx_base_temp, - vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegL rtmp2, rRegL mask_idx, rRegI length, rFlagsReg cr) %{ - predicate(VM_Version::supports_avx512bw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); - effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr); - format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %} - ins_encode %{ - int vlen_enc = vector_length_encoding(this); - int vector_len = Matcher::vector_length(this); - BasicType elem_bt = Matcher::vector_element_basic_type(this); - __ xorq($mask_idx$$Register, $mask_idx$$Register); - __ lea($tmp$$Register, $mem$$Address); - __ movptr($idx_base_temp$$Register, $idx_base$$Register); - __ kmovql($rtmp2$$Register, $mask$$KRegister); - __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister, - $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc); - %} - ins_pipe( pipe_slow ); -%} - -instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{ +instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{ predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask))); effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr); format %{ "vector_masked_gatherLE8 $dst, $mem, $idx_base, $mask\t! 
using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %} ins_encode %{ @@ -4177,15 +4117,15 @@ instruct vgather_masked_subwordLE8B_avx2(vec dst, memory mem, rRegP idx_base, im __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register); } __ xorl($mask_idx$$Register, $mask_idx$$Register); - __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, noreg, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); + __ vgather8b_masked(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); %} ins_pipe( pipe_slow ); %} -instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, immI_0 offset, vec mask, rRegP tmp, rRegP idx_base_temp, +instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, vec mask, rRegP tmp, rRegP idx_base_temp, vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{ predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); + match(Set dst (LoadVectorGatherMasked mem (Binary idx_base mask))); effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr); format %{ "vector_gatherGT8_masked $dst, $mem, $idx_base, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %} ins_encode %{ @@ -4200,53 +4140,7 @@ instruct vgather_masked_subwordGT8B_avx2(vec dst, memory mem, rRegP idx_base, im __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register); } __ xorl($mask_idx$$Register, $mask_idx$$Register); - __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, noreg, $rtmp2$$Register, $xtmp1$$XMMRegister, - $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc); - %} - ins_pipe( pipe_slow ); -%} - -instruct vgather_masked_subwordLE8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegI mask_idx, rRegP tmp, rRegI rtmp, rRegI rtmp2, rFlagsReg cr) %{ - predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) <= 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); - effect(TEMP mask_idx, TEMP tmp, TEMP rtmp, TEMP rtmp2, KILL cr); - format %{ "vector_masked_gatherLE8_off $dst, $mem, $idx_base, $offset, $mask\t! 
using $mask_idx, $tmp, $rtmp and $rtmp2 as TEMP" %} - ins_encode %{ - int vlen_enc = vector_length_encoding(this); - BasicType elem_bt = Matcher::vector_element_basic_type(this); - __ lea($tmp$$Register, $mem$$Address); - __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc); - if (elem_bt == T_SHORT) { - __ movl($mask_idx$$Register, 0x55555555); - __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register); - } - __ xorl($mask_idx$$Register, $mask_idx$$Register); - __ vgather8b_masked_offset(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base$$Register, $offset$$Register, - $rtmp2$$Register, $mask_idx$$Register, $rtmp$$Register, vlen_enc); - %} - ins_pipe( pipe_slow ); -%} - -instruct vgather_masked_subwordGT8B_off_avx2(vec dst, memory mem, rRegP idx_base, rRegI offset, vec mask, rRegP tmp, rRegP idx_base_temp, - vec xtmp1, vec xtmp2, vec xtmp3, rRegI rtmp, rRegI rtmp2, rRegI mask_idx, rRegI length, rFlagsReg cr) %{ - predicate(!VM_Version::supports_avx512vlbw() && is_subword_type(Matcher::vector_element_basic_type(n)) && Matcher::vector_length_in_bytes(n) > 8); - match(Set dst (LoadVectorGatherMasked mem (Binary idx_base (Binary mask offset)))); - effect(TEMP_DEF dst, TEMP tmp, TEMP idx_base_temp, TEMP xtmp1, TEMP xtmp2, TEMP xtmp3, TEMP rtmp, TEMP rtmp2, TEMP mask_idx, TEMP length, KILL cr); - format %{ "vector_gatherGT8_masked_off $dst, $mem, $idx_base, $offset, $mask\t! using $tmp, $idx_base_temp, $xtmp1, $xtmp2, $xtmp3, $rtmp, $rtmp2, $mask_idx and $length as TEMP" %} - ins_encode %{ - int vlen_enc = vector_length_encoding(this); - int vector_len = Matcher::vector_length(this); - BasicType elem_bt = Matcher::vector_element_basic_type(this); - __ xorl($mask_idx$$Register, $mask_idx$$Register); - __ lea($tmp$$Register, $mem$$Address); - __ movptr($idx_base_temp$$Register, $idx_base$$Register); - __ vpmovmskb($rtmp2$$Register, $mask$$XMMRegister, vlen_enc); - if (elem_bt == T_SHORT) { - __ movl($mask_idx$$Register, 0x55555555); - __ pextl($rtmp2$$Register, $rtmp2$$Register, $mask_idx$$Register); - } - __ xorl($mask_idx$$Register, $mask_idx$$Register); - __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $offset$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister, + __ vgather_subword(elem_bt, $dst$$XMMRegister, $tmp$$Register, $idx_base_temp$$Register, $rtmp2$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $xtmp3$$XMMRegister, $rtmp$$Register, $mask_idx$$Register, $length$$Register, vector_len, vlen_enc); %} ins_pipe( pipe_slow ); diff --git a/src/hotspot/share/adlc/formssel.cpp b/src/hotspot/share/adlc/formssel.cpp index b938d5b75608d..466b9f8d7cb4e 100644 --- a/src/hotspot/share/adlc/formssel.cpp +++ b/src/hotspot/share/adlc/formssel.cpp @@ -4360,7 +4360,7 @@ bool MatchRule::is_vector() const { "RoundDoubleModeV","RotateLeftV" , "RotateRightV", "LoadVector","StoreVector", "LoadVectorGather", "StoreVectorScatter", "LoadVectorGatherMasked", "StoreVectorScatterMasked", "SelectFromTwoVector", "VectorTest", "VectorLoadMask", "VectorStoreMask", "VectorBlend", "VectorInsert", - "VectorRearrange", "VectorLoadShuffle", "VectorLoadConst", + "VectorRearrange", "VectorLoadShuffle", "VectorLoadConst", "VectorSlice", "VectorCastB2X", "VectorCastS2X", "VectorCastI2X", "VectorCastL2X", "VectorCastF2X", "VectorCastD2X", "VectorCastF2HF", "VectorCastHF2F", "VectorUCastB2X", "VectorUCastS2X", "VectorUCastI2X", @@ -4368,7 +4368,7 @@ bool MatchRule::is_vector() const { "FmaVD", "FmaVF", "FmaVHF", "PopCountVI", "PopCountVL", "PopulateIndex", 
"VectorLongToMask", "CountLeadingZerosV", "CountTrailingZerosV", "SignumVF", "SignumVD", "SaturatingAddV", "SaturatingSubV", // Next are vector mask ops. - "MaskAll", "AndVMask", "OrVMask", "XorVMask", "VectorMaskCast", + "MaskAll", "AndVMask", "OrVMask", "XorVMask", "VectorMaskCast", "VectorMaskWiden", "RoundVF", "RoundVD", // Next are not supported currently. "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D", diff --git a/src/hotspot/share/classfile/vmIntrinsics.hpp b/src/hotspot/share/classfile/vmIntrinsics.hpp index 49446b53b98bb..c6254b0bcaefe 100644 --- a/src/hotspot/share/classfile/vmIntrinsics.hpp +++ b/src/hotspot/share/classfile/vmIntrinsics.hpp @@ -1223,9 +1223,13 @@ class methodHandle; "Ljava/lang/Class;" \ "I" \ "Ljava/lang/Class;" \ + "I" \ "Ljava/lang/Object;" \ "J" \ "Ljdk/internal/vm/vector/VectorSupport$Vector;" \ + "Ljdk/internal/vm/vector/VectorSupport$Vector;" \ + "Ljdk/internal/vm/vector/VectorSupport$Vector;" \ + "Ljdk/internal/vm/vector/VectorSupport$Vector;" \ "Ljdk/internal/vm/vector/VectorSupport$VectorMask;" \ "Ljava/lang/Object;" \ "I[II" \ @@ -1240,6 +1244,7 @@ class methodHandle; "Ljava/lang/Class;" \ "I" \ "Ljava/lang/Class;" \ + "I" \ "Ljava/lang/Object;" \ "J" \ "Ljdk/internal/vm/vector/VectorSupport$Vector;" \ diff --git a/src/hotspot/share/opto/classes.hpp b/src/hotspot/share/opto/classes.hpp index bc259eed2d101..2c898b7503258 100644 --- a/src/hotspot/share/opto/classes.hpp +++ b/src/hotspot/share/opto/classes.hpp @@ -513,6 +513,7 @@ macro(VectorUnbox) macro(VectorMaskWrapper) macro(VectorMaskCmp) macro(VectorMaskCast) +macro(VectorMaskWiden) macro(VectorTest) macro(VectorBlend) macro(VectorRearrange) @@ -535,6 +536,7 @@ macro(VectorUCastS2X) macro(VectorUCastI2X) macro(VectorizedHashCode) macro(VectorInsert) +macro(VectorSlice) macro(MaskAll) macro(AndVMask) macro(OrVMask) diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp index 783631bf08d89..fb0de99f6e09b 100644 --- a/src/hotspot/share/opto/library_call.cpp +++ b/src/hotspot/share/opto/library_call.cpp @@ -722,9 +722,9 @@ bool LibraryCallKit::try_to_inline(int predicate) { case vmIntrinsics::_VectorStoreMaskedOp: return inline_vector_mem_masked_operation(/*is_store=*/true); case vmIntrinsics::_VectorGatherOp: - return inline_vector_gather_scatter(/*is_scatter*/ false); + return inline_vector_gather_scatter(/*is_scatter=*/ false); case vmIntrinsics::_VectorScatterOp: - return inline_vector_gather_scatter(/*is_scatter*/ true); + return inline_vector_gather_scatter(/*is_scatter=*/ true); case vmIntrinsics::_VectorReductionCoerced: return inline_vector_reduction(); case vmIntrinsics::_VectorTest: diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp index ad1ce71c374bf..0bee6dfb0558b 100644 --- a/src/hotspot/share/opto/library_call.hpp +++ b/src/hotspot/share/opto/library_call.hpp @@ -383,6 +383,8 @@ class LibraryCallKit : public GraphKit { bool inline_vector_select_from_two_vectors(); Node* gen_call_to_vector_math(int vector_api_op_id, BasicType bt, int num_elem, Node* opd1, Node* opd2); + Node* gen_gather_load_subword(Node* addr, Node* indexes, Node* indexes1, Node* indexes2, Node* indexes3, const TypeVect* vector_type); + Node* gen_gather_load_masked_subword(Node* addr, Node* indexes, Node* indexes1, Node* indexes2, Node* indexes3, Node* mask, const TypeVect* vector_type); enum VectorMaskUseType { VecMaskUseLoad = 1 << 0, diff --git a/src/hotspot/share/opto/matcher.cpp 
b/src/hotspot/share/opto/matcher.cpp index e34a43cc1e2f6..95ca44286965a 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -2440,6 +2440,7 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) { n->del_req(4); break; } + case Op_VectorSlice: case Op_SelectFromTwoVector: case Op_LoopLimit: { Node* pair1 = new BinaryNode(n->in(1), n->in(2)); @@ -2517,22 +2518,7 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) { n->del_req(3); break; } - case Op_LoadVectorGather: - if (is_subword_type(n->bottom_type()->is_vect()->element_basic_type())) { - Node* pair = new BinaryNode(n->in(MemNode::ValueIn), n->in(MemNode::ValueIn+1)); - n->set_req(MemNode::ValueIn, pair); - n->del_req(MemNode::ValueIn+1); - } - break; - case Op_LoadVectorGatherMasked: - if (is_subword_type(n->bottom_type()->is_vect()->element_basic_type())) { - Node* pair2 = new BinaryNode(n->in(MemNode::ValueIn + 1), n->in(MemNode::ValueIn + 2)); - Node* pair1 = new BinaryNode(n->in(MemNode::ValueIn), pair2); - n->set_req(MemNode::ValueIn, pair1); - n->del_req(MemNode::ValueIn+2); - n->del_req(MemNode::ValueIn+1); - break; - } // fall-through + case Op_LoadVectorGatherMasked: // fall-through case Op_StoreVectorScatter: { Node* pair = new BinaryNode(n->in(MemNode::ValueIn), n->in(MemNode::ValueIn+1)); n->set_req(MemNode::ValueIn, pair); diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index baf43b0d5388c..4fe3029046a43 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -343,6 +343,9 @@ class Matcher : public PhaseTransform { static bool vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen); + // Return true if gather/scatter needs vector index as input. 
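+  // Backends that expand subword gathers from the index array directly
+  // (e.g. x86) return false for subword types; backends whose gather
+  // instructions consume a vector of indices (e.g. SVE) return true.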
+ static bool gather_scatter_needs_vector_index(BasicType elem_bt, int vlen); + static const RegMask* predicate_reg_mask(void); // Vector width in bytes @@ -365,6 +368,8 @@ class Matcher : public PhaseTransform { // Vector ideal reg static uint vector_ideal_reg(int len); + // Vector ideal reg size + static uint vector_ideal_reg_size(int len); // Vector length static uint vector_length(const Node* n); diff --git a/src/hotspot/share/opto/node.hpp b/src/hotspot/share/opto/node.hpp index 1cb9009ef276b..a84df10a7b8f4 100644 --- a/src/hotspot/share/opto/node.hpp +++ b/src/hotspot/share/opto/node.hpp @@ -192,6 +192,7 @@ class StoreVectorScatterNode; class StoreVectorScatterMaskedNode; class VerifyVectorAlignmentNode; class VectorMaskCmpNode; +class VectorMaskWidenNode; class VectorUnboxNode; class VectorSet; class VectorReinterpretNode; @@ -748,6 +749,7 @@ class Node { DEFINE_CLASS_ID(NegV, Vector, 8) DEFINE_CLASS_ID(SaturatingVector, Vector, 9) DEFINE_CLASS_ID(MulVL, Vector, 10) + DEFINE_CLASS_ID(VectorMaskWiden, Vector, 11) DEFINE_CLASS_ID(Con, Type, 8) DEFINE_CLASS_ID(ConI, Con, 0) DEFINE_CLASS_ID(SafePointScalarMerge, Type, 9) @@ -1009,6 +1011,7 @@ class Node { DEFINE_CLASS_QUERY(Type) DEFINE_CLASS_QUERY(Vector) DEFINE_CLASS_QUERY(VectorMaskCmp) + DEFINE_CLASS_QUERY(VectorMaskWiden) DEFINE_CLASS_QUERY(VectorUnbox) DEFINE_CLASS_QUERY(VectorReinterpret) DEFINE_CLASS_QUERY(CompressV) diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp index e33d7b1968682..9984e0bf4626c 100644 --- a/src/hotspot/share/opto/vectorIntrinsics.cpp +++ b/src/hotspot/share/opto/vectorIntrinsics.cpp @@ -1124,18 +1124,144 @@ bool LibraryCallKit::inline_vector_mem_masked_operation(bool is_store) { return true; } -// , -// W extends Vector, -// S extends VectorSpecies, -// M extends VectorMask, -// E> -// V loadWithMap(Class vectorClass, Class maskClass, Class elementType, int length, -// Class> vectorIndexClass, -// Object base, long offset, // Unsafe addressing -// W index_vector, M m, -// C container, int index, int[] indexMap, int indexM, S s, // Arguments for default implementation -// LoadVectorOperationWithMap defaultImpl) +Node* LibraryCallKit::gen_gather_load_subword(Node* addr, Node* indexes, Node* indexes1, Node* indexes2, + Node* indexes3, const TypeVect* vector_type) { + BasicType elem_bt = vector_type->element_basic_type(); + uint elem_num = vector_type->length(); + const TypeVect* index_vect_type = indexes->bottom_type()->isa_vect(); + const TypePtr* addr_type = gvn().type(addr)->isa_ptr(); + Node* addr_mem = memory(addr); + + // The first gather. + Node* vgather = gvn().transform(new LoadVectorGatherNode(control(), addr_mem, addr, addr_type, vector_type, indexes)); + + uint index_elem_num = index_vect_type != nullptr ? index_vect_type->length() : 0; + uint vector_reg_size = Matcher::vector_ideal_reg_size(vector_type->length_in_bytes()); + uint max_elem_num = vector_reg_size / type2aelembytes(elem_bt); + // The second gather. + if (indexes1 != nullptr) { + assert(index_vect_type != nullptr, "indexes must be a vector"); + assert(Type::equals(indexes1->bottom_type(), index_vect_type), "invalid vector type"); + Node* vgather1 = gvn().transform(new LoadVectorGatherNode(control(), addr_mem, addr, addr_type, vector_type, indexes1)); + // Merge the second gather with the first gather result. 
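+    // VectorSlice rotates vgather1 so that its index_elem_num results move
+    // from the low lanes up to lanes [index_elem_num, 2 * index_elem_num);
+    // OrV then merges them with the first gather, whose other lanes are zero.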
+    Node* idx = gvn().makecon(TypeInt::make(max_elem_num - index_elem_num));
+    Node* vslice = gvn().transform(new VectorSliceNode(vgather1, vgather1, idx));
+    vgather = gvn().transform(new OrVNode(vgather, vslice, vector_type));
+  }
+
+  // The third and fourth gathers for byte type.
+  if (indexes2 != nullptr) {
+    assert(elem_bt == T_BYTE, "only byte vectors need more than two gather loads");
+    assert(indexes3 != nullptr, "indexes3 must be non-null");
+    assert(Type::equals(indexes2->bottom_type(), index_vect_type), "invalid vector type");
+    assert(Type::equals(indexes3->bottom_type(), index_vect_type), "invalid vector type");
+    Node* vgather2 = gvn().transform(new LoadVectorGatherNode(control(), addr_mem, addr, addr_type, vector_type, indexes2));
+    // Merge the third gather with previous results.
+    Node* idx = gvn().makecon(TypeInt::make(max_elem_num - 2 * index_elem_num));
+    Node* vslice = gvn().transform(new VectorSliceNode(vgather2, vgather2, idx));
+    vgather = gvn().transform(new OrVNode(vgather, vslice, vector_type));
+
+    Node* vgather3 = gvn().transform(new LoadVectorGatherNode(control(), addr_mem, addr, addr_type, vector_type, indexes3));
+    // Merge the fourth gather with previous results.
+    idx = gvn().makecon(TypeInt::make(max_elem_num - 3 * index_elem_num));
+    vslice = gvn().transform(new VectorSliceNode(vgather3, vgather3, idx));
+    vgather = gvn().transform(new OrVNode(vgather, vslice, vector_type));
+  }
+  return vgather;
+}
+
+Node* LibraryCallKit::gen_gather_load_masked_subword(Node* addr, Node* indexes, Node* indexes1,
+                                                     Node* indexes2, Node* indexes3, Node* mask,
+                                                     const TypeVect* vector_type) {
+  BasicType elem_bt = vector_type->element_basic_type();
+  const TypeVect* index_vect_type = indexes->bottom_type()->isa_vect();
+  const TypePtr* addr_type = gvn().type(addr)->isa_ptr();
+  Node* addr_mem = memory(addr);
+
+  // Case for architectures that do not support subword vector gathers with a vector index.
+  // The mask needs to be kept as it is.
+  if (index_vect_type == nullptr) {
+    return gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vector_type, indexes, mask));
+  }
+
+  // The subword-typed mask input needs to be widened to int type, and the element
+  // number of the new mask is the same as the index vector's.
+  uint index_elem_num = index_vect_type->length();
+  const TypeVect* mask_vt = TypeVect::makemask(T_INT, index_elem_num);
+  Node* vmask_temp = mask;
+  Node* vmask = nullptr;
+
+  // The first masked vector gather with vector index. Generate a new vector mask by widening
+  // the lower half of the mask to int type. For a byte vector, this may be the lowest quarter
+  // of the mask.
+  if (elem_bt == T_BYTE) {
+    const TypeVect* mask_vt_s = TypeVect::makemask(T_SHORT, MaxVectorSize / type2aelembytes(T_SHORT));
+    vmask_temp = gvn().transform(new VectorMaskWidenNode(mask, mask_vt_s, /* is_lo */true));
+    vmask = gvn().transform(new VectorMaskWidenNode(vmask_temp, mask_vt, /* is_lo */true));
+  } else {
+    vmask = gvn().transform(new VectorMaskWidenNode(mask, mask_vt, /* is_lo */true));
+  }
+  Node* vgather = gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vector_type, indexes, vmask));
+
+  // The second masked vector gather with vector index.
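+  // Illustrative mask flow for a short vector, assuming 8 short lanes and
+  // 4 int lanes per index vector (hypothetical lane values):
+  //   mask (short)   = [m7 m6 m5 m4 m3 m2 m1 m0]
+  //   widen_lo(mask) -> [m3 m2 m1 m0]  (int mask for the first gather)
+  //   widen_hi(mask) -> [m7 m6 m5 m4]  (int mask for the second gather)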
+  uint vector_reg_size = Matcher::vector_ideal_reg_size(vector_type->length_in_bytes());
+  uint max_elem_num = vector_reg_size / type2aelembytes(elem_bt);
+  if (indexes1 != nullptr) {
+    assert(index_vect_type != nullptr, "indexes must be a vector");
+    assert(Type::equals(indexes1->bottom_type(), index_vect_type), "invalid vector type");
+
+    // Generate a new vector mask by widening the higher half of the mask to int type. For a byte
+    // vector, this may be the second quarter of the mask, counting from the lowest element.
+    vmask = gvn().transform(new VectorMaskWidenNode(vmask_temp, mask_vt, /* is_lo */false));
+    Node* vgather1 = gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vector_type, indexes1, vmask));
+    // Merge the second gather with the first gather result.
+    Node* idx = gvn().makecon(TypeInt::make(max_elem_num - index_elem_num));
+    Node* slice = gvn().transform(new VectorSliceNode(vgather1, vgather1, idx));
+    vgather = gvn().transform(new OrVNode(vgather, slice, vector_type));
+  }
+
+  // The third and fourth masked vector gathers for a byte vector.
+  if (indexes2 != nullptr) {
+    assert(elem_bt == T_BYTE, "only byte vectors need more than two gather loads");
+    assert(indexes3 != nullptr, "indexes3 must be non-null");
+    assert(Type::equals(indexes2->bottom_type(), index_vect_type), "invalid vector type");
+    assert(Type::equals(indexes3->bottom_type(), index_vect_type), "invalid vector type");
+
+    // The third masked vector gather with vector index. The new vector mask is widened from
+    // the third quarter of the input mask.
+    const TypeVect* mask_vt_s = TypeVect::makemask(T_SHORT, MaxVectorSize / type2aelembytes(T_SHORT));
+    vmask_temp = gvn().transform(new VectorMaskWidenNode(mask, mask_vt_s, /* is_lo */false));
+    vmask = gvn().transform(new VectorMaskWidenNode(vmask_temp, mask_vt, /* is_lo */true));
+    Node* vgather2 = gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vector_type, indexes2, vmask));
+    // Merge the third gather with previous results.
+    Node* idx = gvn().makecon(TypeInt::make(max_elem_num - 2 * index_elem_num));
+    Node* slice = gvn().transform(new VectorSliceNode(vgather2, vgather2, idx));
+    vgather = gvn().transform(new OrVNode(vgather, slice, vector_type));
+
+    // The fourth masked vector gather with vector index. The new vector mask is widened from
+    // the fourth quarter of the input mask.
+    vmask = gvn().transform(new VectorMaskWidenNode(vmask_temp, mask_vt, /* is_lo */false));
+    Node* vgather3 = gvn().transform(new LoadVectorGatherMaskedNode(control(), addr_mem, addr, addr_type, vector_type, indexes3, vmask));
+    // Merge the fourth gather with previous results.
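+    // After this last merge the four partial results occupy disjoint quarters
+    // of the destination vector, lowest quarter first:
+    //   vgather = [part4 | part3 | part2 | part1]
+    // which relies on each gather leaving its inactive lanes zeroed.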
+ idx = gvn().makecon(TypeInt::make(max_elem_num - 3 * index_elem_num)); + slice = gvn().transform(new VectorSliceNode(vgather3, vgather3, idx)); + vgather = gvn().transform(new OrVNode(vgather, slice, vector_type)); + } + return vgather; +} + +// +// , +// W extends Vector, +// M extends VectorMask,_ +// E> +// V loadWithMap(Class vClass, Class mClass, Class eClass, int length, +// Class> vectorIndexClass, int indexLength, +// Object base, long offset, +// W indexVector1, W index_vector2, W index_vector3, W index_vector4, +// M m, C container, int index, int[] indexMap, int indexM, S s, +// LoadVectorOperationWithMap defaultImpl) // // , @@ -1143,7 +1269,8 @@ bool LibraryCallKit::inline_vector_mem_masked_operation(bool is_store) { // M extends VectorMask, // E> // void storeWithMap(Class vectorClass, Class maskClass, Class elementType, -// int length, Class> vectorIndexClass, Object base, long offset, // Unsafe addressing +// int length, Class> vectorIndexClass, +// int indexLength, Object base, long offset, // Unsafe addressing // W index_vector, V v, M m, // C container, int index, int[] indexMap, int indexM, // Arguments for default implementation // StoreVectorOperationWithMap defaultImpl) @@ -1154,14 +1281,17 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { const TypeInstPtr* elem_klass = gvn().type(argument(2))->isa_instptr(); const TypeInt* vlen = gvn().type(argument(3))->isa_int(); const TypeInstPtr* vector_idx_klass = gvn().type(argument(4))->isa_instptr(); + const TypeInt* idx_vlen = gvn().type(argument(5))->isa_int(); if (vector_klass == nullptr || elem_klass == nullptr || vector_idx_klass == nullptr || vlen == nullptr || - vector_klass->const_oop() == nullptr || elem_klass->const_oop() == nullptr || vector_idx_klass->const_oop() == nullptr || !vlen->is_con()) { - log_if_needed(" ** missing constant: vclass=%s etype=%s vlen=%s viclass=%s", + idx_vlen == nullptr || vector_klass->const_oop() == nullptr || elem_klass->const_oop() == nullptr || + vector_idx_klass->const_oop() == nullptr || !vlen->is_con() || !idx_vlen->is_con()) { + log_if_needed(" ** missing constant: vclass=%s etype=%s vlen=%s viclass=%s idx_vlen=%s", NodeClassNames[argument(0)->Opcode()], NodeClassNames[argument(2)->Opcode()], NodeClassNames[argument(3)->Opcode()], - NodeClassNames[argument(4)->Opcode()]); + NodeClassNames[argument(4)->Opcode()], + NodeClassNames[argument(5)->Opcode()]); return false; // not enough info for intrinsification } @@ -1178,8 +1308,10 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { BasicType elem_bt = elem_type->basic_type(); int num_elem = vlen->get_con(); + int idx_num_elem = idx_vlen->get_con(); - const Type* vmask_type = gvn().type(is_scatter ? argument(10) : argument(9)); + Node* m = is_scatter ? argument(11) : argument(13); + const Type* vmask_type = gvn().type(m); bool is_masked_op = vmask_type != TypePtr::NULL_PTR; if (is_masked_op) { if (mask_klass == nullptr || mask_klass->const_oop() == nullptr) { @@ -1215,25 +1347,50 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { } } - // Check that the vector holding indices is supported by architecture - // For sub-word gathers expander receive index array. 
- if (!is_subword_type(elem_bt) && !arch_supports_vector(Op_LoadVector, num_elem, T_INT, VecMaskNotUsed)) { + bool needs_vector_index = Matcher::gather_scatter_needs_vector_index(elem_bt, num_elem); + if (needs_vector_index) { + // Check that the vector holding indices is supported by architecture + if (!arch_supports_vector(Op_LoadVector, idx_num_elem, T_INT, VecMaskNotUsed)) { log_if_needed(" ** not supported: arity=%d op=%s/loadindex vlen=%d etype=int is_masked_op=%d", - is_scatter, is_scatter ? "scatter" : "gather", - num_elem, is_masked_op ? 1 : 0); + is_scatter, is_scatter ? "scatter" : "gather", + idx_num_elem, is_masked_op ? 1 : 0); return false; // not supported + } + + // Check more ops that are necessary to finish the whole subword gather with vector indexes. + if (!is_scatter && gvn().type(argument(10)) != TypePtr::NULL_PTR) { + if (!arch_supports_vector(Op_VectorSlice, num_elem, elem_bt, VecMaskNotUsed) || + !arch_supports_vector(Op_OrV, num_elem, elem_bt, VecMaskNotUsed)) { + log_if_needed(" ** not supported: op=gather/merge vlen=%d etype=%s is_masked_op=%d", + num_elem, type2name(elem_bt), is_masked_op ? 1 : 0); + return false; // not supported + } + if (is_masked_op && !arch_supports_vector(Op_VectorMaskWiden, idx_num_elem, T_INT, VecMaskNotUsed)) { + log_if_needed(" ** not supported: op=gather/maskwiden vlen=%d etype=%s is_masked_op=1", + idx_num_elem, type2name(elem_bt)); + return false; // not supported + } + } } - Node* base = argument(5); - Node* offset = ConvL2X(argument(6)); + Node* base = argument(6); + Node* offset = ConvL2X(argument(7)); // Save state and restore on bailout uint old_sp = sp(); SafePointNode* old_map = clone_map(); - Node* addr = make_unsafe_address(base, offset, elem_bt, true); + Node* addr = nullptr; + if (needs_vector_index) { + addr = make_unsafe_address(base, offset, elem_bt, true); + } else { + assert(is_subword_type(elem_bt), "Only subword gather operation supports non-vector indexes"); + assert(!is_scatter, "Only supports gather operation for subword types now"); + Node* index = argument(15); + addr = array_element_address(base, index, elem_bt); + } - const TypePtr *addr_type = gvn().type(addr)->isa_ptr(); + const TypePtr* addr_type = gvn().type(addr)->isa_ptr(); const TypeAryPtr* arr_type = addr_type->isa_aryptr(); // The array must be consistent with vector type @@ -1255,26 +1412,66 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { return false; } - Node* index_vect = nullptr; + // Get the indexes for gather/scatter. + Node* indexes = nullptr; const TypeInstPtr* vbox_idx_type = TypeInstPtr::make_exact(TypePtr::NotNull, vbox_idx_klass); - if (!is_subword_type(elem_bt)) { - index_vect = unbox_vector(argument(8), vbox_idx_type, T_INT, num_elem); - if (index_vect == nullptr) { + if (!needs_vector_index) { + Node* indexMap = argument(16); + Node* indexM = argument(17); + indexes = array_element_address(indexMap, indexM, T_INT); + } else { + // Get the first index vector. + indexes = unbox_vector(argument(9), vbox_idx_type, T_INT, idx_num_elem); + if (indexes == nullptr) { set_map(old_map); set_sp(old_sp); return false; } } + // Get other index vectors if they are not nullptr for subword gather operation. + Node* indexes1 = nullptr; + Node* indexes2 = nullptr; + Node* indexes3 = nullptr; + if (!is_scatter && needs_vector_index) { + // Get the second index vector if they are not nullptr. 
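+    // (Arguments 9..12 carry up to four int index vectors; subword gathers may
+    // need up to two of them for short and up to four for byte, with unused
+    // trailing slots passed as null.)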
+ if (gvn().type(argument(10)) != TypePtr::NULL_PTR) { + assert(is_subword_type(elem_bt), "Only subword gather needs more index vectors"); + indexes1 = unbox_vector(argument(10), vbox_idx_type, T_INT, idx_num_elem); + if (indexes1 == nullptr) { + set_map(old_map); + set_sp(old_sp); + return false; + } + } + + // Get the third and fourth index vectors if they are not nullptr. + if (gvn().type(argument(11)) != TypePtr::NULL_PTR) { + assert(elem_bt == T_BYTE, "Only byte gather needs more than 2 index vectors"); + if (gvn().type(argument(12)) == TypePtr::NULL_PTR) { + set_map(old_map); + set_sp(old_sp); + return false; + } + + indexes2 = unbox_vector(argument(11), vbox_idx_type, T_INT, idx_num_elem); + indexes3 = unbox_vector(argument(12), vbox_idx_type, T_INT, idx_num_elem); + if (indexes2 == nullptr || indexes3 == nullptr) { + set_map(old_map); + set_sp(old_sp); + return false; + } + } + } + + // Get the vector mask value. Node* mask = nullptr; if (is_masked_op) { ciKlass* mbox_klass = mask_klass->const_oop()->as_instance()->java_lang_Class_klass(); const TypeInstPtr* mbox_type = TypeInstPtr::make_exact(TypePtr::NotNull, mbox_klass); - mask = unbox_vector(is_scatter ? argument(10) : argument(9), mbox_type, elem_bt, num_elem); + mask = unbox_vector(m, mbox_type, elem_bt, num_elem); if (mask == nullptr) { - log_if_needed(" ** unbox failed mask=%s", - is_scatter ? NodeClassNames[argument(10)->Opcode()] - : NodeClassNames[argument(9)->Opcode()]); + log_if_needed(" ** unbox failed mask=%s", NodeClassNames[m->Opcode()]); set_map(old_map); set_sp(old_sp); return false; @@ -1283,7 +1480,7 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { const TypeVect* vector_type = TypeVect::make(elem_bt, num_elem); if (is_scatter) { - Node* val = unbox_vector(argument(9), vbox_type, elem_bt, num_elem); + Node* val = unbox_vector(argument(10), vbox_type, elem_bt, num_elem); if (val == nullptr) { set_map(old_map); set_sp(old_sp); @@ -1293,29 +1490,24 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { Node* vstore = nullptr; if (mask != nullptr) { - vstore = gvn().transform(new StoreVectorScatterMaskedNode(control(), memory(addr), addr, addr_type, val, index_vect, mask)); + vstore = gvn().transform(new StoreVectorScatterMaskedNode(control(), memory(addr), addr, addr_type, val, indexes, mask)); } else { - vstore = gvn().transform(new StoreVectorScatterNode(control(), memory(addr), addr, addr_type, val, index_vect)); + vstore = gvn().transform(new StoreVectorScatterNode(control(), memory(addr), addr, addr_type, val, indexes)); } set_memory(vstore, addr_type); } else { Node* vload = nullptr; - Node* index = argument(11); - Node* indexMap = argument(12); - Node* indexM = argument(13); if (mask != nullptr) { if (is_subword_type(elem_bt)) { - Node* index_arr_base = array_element_address(indexMap, indexM, T_INT); - vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, index_arr_base, mask, index)); + vload = gen_gather_load_masked_subword(addr, indexes, indexes1, indexes2, indexes3, mask, vector_type); } else { - vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, index_vect, mask)); + vload = gvn().transform(new LoadVectorGatherMaskedNode(control(), memory(addr), addr, addr_type, vector_type, indexes, mask)); } } else { if (is_subword_type(elem_bt)) { - Node* index_arr_base = array_element_address(indexMap, indexM, T_INT); - vload = gvn().transform(new 
LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, index_arr_base, index)); + vload = gen_gather_load_subword(addr, indexes, indexes1, indexes2, indexes3, vector_type); } else { - vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, index_vect)); + vload = gvn().transform(new LoadVectorGatherNode(control(), memory(addr), addr, addr_type, vector_type, indexes)); } } Node* box = box_vector(vload, vbox_type, elem_bt, num_elem); @@ -1323,7 +1515,6 @@ bool LibraryCallKit::inline_vector_gather_scatter(bool is_scatter) { } destruct_map_clone(old_map); - C->set_max_vector_size(MAX2(C->max_vector_size(), (uint)(num_elem * type2aelembytes(elem_bt)))); return true; } diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 084b70a690653..94cdfdc6cea4d 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -1041,19 +1041,31 @@ Node* VectorNode::try_to_gen_masked_vector(PhaseGVN* gvn, Node* node, const Type uint vlen = vt->length(); BasicType bt = vt->element_basic_type(); + BasicType mask_bt = bt; + uint mask_vlen = vlen; + if (vopc == Op_LoadVectorGather && is_subword_type(bt)) { + // It uses the index vector's type as the mask type for subword gather load. + const TypeVect* index_vt = node->in(MemNode::ValueIn)->bottom_type()->isa_vect(); + if (index_vt == nullptr) { + return nullptr; + } + mask_bt = index_vt->element_basic_type(); + mask_vlen = index_vt->length(); + } + // Predicated vectors do not need to add another mask input if (node->is_predicated_vector() || !Matcher::has_predicated_vectors() || !Matcher::match_rule_supported_vector_masked(vopc, vlen, bt) || - !Matcher::match_rule_supported_vector(Op_VectorMaskGen, vlen, bt)) { + !Matcher::match_rule_supported_vector(Op_VectorMaskGen, mask_vlen, mask_bt)) { return nullptr; } Node* mask = nullptr; // Generate a vector mask for vector operation whose vector length is lower than the // hardware supported max vector length. 
- if (vt->length_in_bytes() < (uint)MaxVectorSize) { + if (mask_vlen * type2aelembytes(mask_bt) < (uint)MaxVectorSize) { Node* length = gvn->transform(new ConvI2LNode(gvn->makecon(TypeInt::make(vlen)))); - mask = gvn->transform(VectorMaskGenNode::make(length, bt, vlen)); + mask = gvn->transform(VectorMaskGenNode::make(length, mask_bt, mask_vlen)); } else { return nullptr; } diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index 36706a7b7a14b..eac2c9bc9cc6b 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -1112,25 +1112,18 @@ class LoadVectorNode : public LoadNode { // Load Vector from memory via index map class LoadVectorGatherNode : public LoadVectorNode { public: - LoadVectorGatherNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* offset = nullptr) + LoadVectorGatherNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices) : LoadVectorNode(c, mem, adr, at, vt) { init_class_id(Class_LoadVectorGather); add_req(indices); DEBUG_ONLY(bool is_subword = is_subword_type(vt->element_basic_type())); assert(is_subword || indices->bottom_type()->is_vect(), "indices must be in vector"); - assert(is_subword || !offset, ""); assert(req() == MemNode::ValueIn + 1, "match_edge expects that index input is in MemNode::ValueIn"); - if (offset) { - add_req(offset); - } } virtual int Opcode() const; virtual uint match_edge(uint idx) const { - return idx == MemNode::Address || - idx == MemNode::ValueIn || - ((is_subword_type(vect_type()->element_basic_type())) && - idx == MemNode::ValueIn + 1); + return idx == MemNode::Address || idx == MemNode::ValueIn; } virtual int store_Opcode() const { // Ensure it is different from any store opcode to avoid folding when indices are used @@ -1249,23 +1242,19 @@ class LoadVectorMaskedNode : public LoadVectorNode { // Load Vector from memory via index map under the influence of a predicate register(mask). class LoadVectorGatherMaskedNode : public LoadVectorNode { public: - LoadVectorGatherMaskedNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* mask, Node* offset = nullptr) + LoadVectorGatherMaskedNode(Node* c, Node* mem, Node* adr, const TypePtr* at, const TypeVect* vt, Node* indices, Node* mask) : LoadVectorNode(c, mem, adr, at, vt) { init_class_id(Class_LoadVectorGatherMasked); add_req(indices); add_req(mask); assert(req() == MemNode::ValueIn + 2, "match_edge expects that last input is in MemNode::ValueIn+1"); - if (is_subword_type(vt->element_basic_type())) { - add_req(offset); - } + assert(is_subword_type(vt->element_basic_type()) || indices->bottom_type()->is_vect(), "indices must be in vector"); } virtual int Opcode() const; virtual uint match_edge(uint idx) const { return idx == MemNode::Address || idx == MemNode::ValueIn || - idx == MemNode::ValueIn + 1 || - (is_subword_type(vect_type()->is_vect()->element_basic_type()) && - idx == MemNode::ValueIn + 2); } + idx == MemNode::ValueIn + 1; } virtual int store_Opcode() const { // Ensure it is different from any store opcode to avoid folding when indices and mask are used return -1; @@ -1745,6 +1734,24 @@ class VectorRearrangeNode : public VectorNode { Node* vec_shuffle() const { return in(2); } }; +// Generate a vector by slicing the two source vectors based on an index. 
+// +// Copy the indexed byte up to the last byte of the first source vector +// to the bottom of the result vector, then fill the remainder of the +// result starting from the first byte of the second source vector. +// +// E.g. src1 = [hgfedcba] src2 = [ponmlkji] index = 3 +// dst = [kjihgfed] +class VectorSliceNode : public VectorNode { + public: + VectorSliceNode(Node* vec1, Node* vec2, Node* index) + : VectorNode(vec1, vec2, index, vec1->bottom_type()->is_vect()) { + assert(index->bottom_type()->isa_int(), "index must be an integral value"); + assert(index->is_Con(), "index must be a constant"); + } + + virtual int Opcode() const; +}; // Select elements from two source vectors based on the wrapped indexes held in // the first vector. @@ -1804,6 +1811,28 @@ class VectorMaskCastNode : public VectorNode { virtual int Opcode() const; }; +// Unpack the elements to twice size. +class VectorMaskWidenNode : public VectorNode { + private: + // "_is_lo" is used to denote whether the lower half or + // the upper half of the elements are widened. + // E.g. src = [1111 0101] + // _is_lo = true, dst = [0001 0001] + // _is_lo = false, dst = [0101 0101] + bool _is_lo; + + public: + VectorMaskWidenNode(Node* in, const TypeVect* vt, bool is_lo) : VectorNode(in, vt), _is_lo(is_lo) { + init_class_id(Class_VectorMaskWiden); + const TypeVect* in_vt = in->bottom_type()->is_vect(); + assert(type2aelembytes(in_vt->element_basic_type()) == type2aelembytes(vt->element_basic_type()) / 2, "must be half size"); + } + + bool is_lo() const { return _is_lo; } + virtual int Opcode() const; + virtual uint size_of() const { return sizeof(*this); } +}; + // This is intended for use as a simple reinterpret node that has no cast. class VectorReinterpretNode : public VectorNode { private: diff --git a/src/java.base/share/classes/jdk/internal/vm/vector/VectorSupport.java b/src/java.base/share/classes/jdk/internal/vm/vector/VectorSupport.java index cbf30da228934..0808f37d2370c 100644 --- a/src/java.base/share/classes/jdk/internal/vm/vector/VectorSupport.java +++ b/src/java.base/share/classes/jdk/internal/vm/vector/VectorSupport.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -451,8 +451,8 @@ public interface LoadVectorOperationWithMap vClass, Class mClass, Class eClass, int length, Class> vectorIndexClass, - Object base, long offset, - W index_vector, + int indexLength, Object base, long offset, + W indexVector1, W indexVector2, W indexVector3, W indexVector4, M m, C container, int index, int[] indexMap, int indexM, S s, LoadVectorOperationWithMap defaultImpl) { assert isNonCapturingLambda(defaultImpl) : defaultImpl; @@ -518,7 +518,7 @@ public interface StoreVectorOperationWithMap vClass, Class mClass, Class eClass, int length, Class> vectorIndexClass, - Object base, long offset, + int indexLength, Object base, long offset, W index_vector, V v, M m, C container, int index, int[] indexMap, int indexM, StoreVectorOperationWithMap defaultImpl) { diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java index ed8d273ff37db..5e608807b0564 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ByteVector.java @@ -3117,17 +3117,30 @@ ByteVector fromArray(VectorSpecies species, } // Check indices are within array bounds. - for (int i = 0; i < vsp.length(); i += lsp.length()) { - IntVector vix = IntVector - .fromArray(lsp, indexMap, mapOffset + i) - .add(offset); - VectorIntrinsics.checkIndex(vix, a.length); + IntVector vix0 = IntVector.fromArray(lsp, indexMap, mapOffset).add(offset); + VectorIntrinsics.checkIndex(vix0, a.length); + + int vlen = vsp.length(); + int idx_vlen = lsp.length(); + IntVector vix1 = null; + if (vlen >= idx_vlen * 2) { + vix1 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen).add(offset); + VectorIntrinsics.checkIndex(vix1, a.length); + } + + IntVector vix2 = null; + IntVector vix3 = null; + if (vlen == idx_vlen * 4) { + vix2 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 2).add(offset); + VectorIntrinsics.checkIndex(vix2, a.length); + vix3 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 3).add(offset); + VectorIntrinsics.checkIndex(vix3, a.length); } return VectorSupport.loadWithMap( vectorType, null, byte.class, vsp.laneCount(), - lsp.vectorType(), - a, ARRAY_BASE, null, null, + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, vix2, vix3, null, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); @@ -3879,17 +3892,30 @@ ByteVector fromArray0Template(Class maskClass, byte[] a, int offset, // Check indices are within array bounds. // FIXME: Check index under mask controlling. 
- for (int i = 0; i < vsp.length(); i += lsp.length()) { - IntVector vix = IntVector - .fromArray(lsp, indexMap, mapOffset + i) - .add(offset); - VectorIntrinsics.checkIndex(vix, a.length); + IntVector vix0 = IntVector.fromArray(lsp, indexMap, mapOffset).add(offset); + VectorIntrinsics.checkIndex(vix0, a.length); + + int vlen = vsp.length(); + int idx_vlen = lsp.length(); + IntVector vix1 = null; + if (vlen >= idx_vlen * 2) { + vix1 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen).add(offset); + VectorIntrinsics.checkIndex(vix1, a.length); + } + + IntVector vix2 = null; + IntVector vix3 = null; + if (vlen == idx_vlen * 4) { + vix2 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 2).add(offset); + VectorIntrinsics.checkIndex(vix2, a.length); + vix3 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 3).add(offset); + VectorIntrinsics.checkIndex(vix3, a.length); } return VectorSupport.loadWithMap( vectorType, maskClass, byte.class, vsp.laneCount(), - lsp.vectorType(), - a, ARRAY_BASE, null, m, + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, vix2, vix3, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/DoubleVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/DoubleVector.java index 5fbf02f87bd93..5f9c6f481b8a0 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/DoubleVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/DoubleVector.java @@ -2910,8 +2910,8 @@ DoubleVector fromArray(VectorSpecies species, return VectorSupport.loadWithMap( vectorType, null, double.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, null, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, null, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); @@ -3201,7 +3201,7 @@ void intoArray(double[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), null, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, null, a, offset, indexMap, mapOffset, @@ -3396,8 +3396,8 @@ DoubleVector fromArray0Template(Class maskClass, double[] a, int offset, return VectorSupport.loadWithMap( vectorType, maskClass, double.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, m, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); @@ -3512,7 +3512,7 @@ void intoArray0Template(Class maskClass, double[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, m, a, offset, indexMap, mapOffset, diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java index 26fbe64742d6f..9ba457fa55668 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/FloatVector.java @@ -2916,8 +2916,8 @@ FloatVector fromArray(VectorSpecies species, return VectorSupport.loadWithMap( vectorType, null, float.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, null, + 
isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, null, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); @@ -3188,7 +3188,7 @@ void intoArray(float[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), null, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, null, a, offset, indexMap, mapOffset, @@ -3365,8 +3365,8 @@ FloatVector fromArray0Template(Class maskClass, float[] a, int offset, return VectorSupport.loadWithMap( vectorType, maskClass, float.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, m, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); @@ -3462,7 +3462,7 @@ void intoArray0Template(Class maskClass, float[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, m, a, offset, indexMap, mapOffset, diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/IntVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/IntVector.java index 076a66ed6a543..b3d4c938e8e00 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/IntVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/IntVector.java @@ -3094,8 +3094,8 @@ IntVector fromArray(VectorSpecies species, return VectorSupport.loadWithMap( vectorType, null, int.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, null, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, null, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); @@ -3366,7 +3366,7 @@ void intoArray(int[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), null, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, null, a, offset, indexMap, mapOffset, @@ -3543,8 +3543,8 @@ IntVector fromArray0Template(Class maskClass, int[] a, int offset, return VectorSupport.loadWithMap( vectorType, maskClass, int.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, m, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); @@ -3640,7 +3640,7 @@ void intoArray0Template(Class maskClass, int[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, m, a, offset, indexMap, mapOffset, diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/LongVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/LongVector.java index 21903aa6794e8..7c42bac59d49d 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/LongVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/LongVector.java @@ -2973,8 +2973,8 @@ LongVector fromArray(VectorSpecies species, return VectorSupport.loadWithMap( vectorType, null, long.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, null, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, null, a, offset, indexMap, 
mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); @@ -3264,7 +3264,7 @@ void intoArray(long[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), null, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, null, a, offset, indexMap, mapOffset, @@ -3459,8 +3459,8 @@ LongVector fromArray0Template(Class maskClass, long[] a, int offset, return VectorSupport.loadWithMap( vectorType, maskClass, long.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, m, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); @@ -3575,7 +3575,7 @@ void intoArray0Template(Class maskClass, long[] a, int offset, VectorSupport.storeWithMap( vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, m, a, offset, indexMap, mapOffset, diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java index 0bb97da824459..7ae07f1682161 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/ShortVector.java @@ -3118,17 +3118,21 @@ ShortVector fromArray(VectorSpecies species, } // Check indices are within array bounds. - for (int i = 0; i < vsp.length(); i += lsp.length()) { - IntVector vix = IntVector - .fromArray(lsp, indexMap, mapOffset + i) - .add(offset); - VectorIntrinsics.checkIndex(vix, a.length); + IntVector vix0 = IntVector.fromArray(lsp, indexMap, mapOffset).add(offset); + VectorIntrinsics.checkIndex(vix0, a.length); + + int vlen = vsp.length(); + int idx_vlen = lsp.length(); + IntVector vix1 = null; + if (vlen >= idx_vlen * 2) { + vix1 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen).add(offset); + VectorIntrinsics.checkIndex(vix1, a.length); } return VectorSupport.loadWithMap( vectorType, null, short.class, vsp.laneCount(), - lsp.vectorType(), - a, ARRAY_BASE, null, null, + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, null, null, null, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); @@ -3865,17 +3869,21 @@ ShortVector fromArray0Template(Class maskClass, short[] a, int offset, // Check indices are within array bounds. // FIXME: Check index under mask controlling. 
- for (int i = 0; i < vsp.length(); i += lsp.length()) { - IntVector vix = IntVector - .fromArray(lsp, indexMap, mapOffset + i) - .add(offset); - VectorIntrinsics.checkIndex(vix, a.length); + IntVector vix0 = IntVector.fromArray(lsp, indexMap, mapOffset).add(offset); + VectorIntrinsics.checkIndex(vix0, a.length); + + int vlen = vsp.length(); + int idx_vlen = lsp.length(); + IntVector vix1 = null; + if (vlen >= idx_vlen * 2) { + vix1 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen).add(offset); + VectorIntrinsics.checkIndex(vix1, a.length); } return VectorSupport.loadWithMap( vectorType, maskClass, short.class, vsp.laneCount(), - lsp.vectorType(), - a, ARRAY_BASE, null, m, + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, null, null, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); diff --git a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template index 8084cc307e867..5113738a23261 100644 --- a/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template +++ b/src/jdk.incubator.vector/share/classes/jdk/incubator/vector/X-Vector.java.template @@ -3724,20 +3724,43 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { } // Check indices are within array bounds. - for (int i = 0; i < vsp.length(); i += lsp.length()) { - IntVector vix = IntVector - .fromArray(lsp, indexMap, mapOffset + i) - .add(offset); - VectorIntrinsics.checkIndex(vix, a.length); + IntVector vix0 = IntVector.fromArray(lsp, indexMap, mapOffset).add(offset); + VectorIntrinsics.checkIndex(vix0, a.length); + + int vlen = vsp.length(); + int idx_vlen = lsp.length(); + IntVector vix1 = null; + if (vlen >= idx_vlen * 2) { + vix1 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen).add(offset); + VectorIntrinsics.checkIndex(vix1, a.length); + } + +#if[byte] + IntVector vix2 = null; + IntVector vix3 = null; + if (vlen == idx_vlen * 4) { + vix2 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 2).add(offset); + VectorIntrinsics.checkIndex(vix2, a.length); + vix3 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 3).add(offset); + VectorIntrinsics.checkIndex(vix3, a.length); } return VectorSupport.loadWithMap( vectorType, null, $type$.class, vsp.laneCount(), - lsp.vectorType(), - a, ARRAY_BASE, null, null, + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, vix2, vix3, null, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); +#else[byte] + return VectorSupport.loadWithMap( + vectorType, null, $type$.class, vsp.laneCount(), + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, null, null, null, + a, offset, indexMap, mapOffset, vsp, + (c, idx, iMap, idy, s, vm) -> + s.vOp(n -> c[idx + iMap[idy+n]])); +#end[byte] } #else[byteOrShort] @ForceInline @@ -3785,8 +3808,8 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { return VectorSupport.loadWithMap( vectorType, null, $type$.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, null, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, null, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(n -> c[idx + iMap[idy+n]])); @@ -4411,7 +4434,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { VectorSupport.storeWithMap( vsp.vectorType(), null, 
vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, null, a, offset, indexMap, mapOffset, @@ -4932,20 +4955,43 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { // Check indices are within array bounds. // FIXME: Check index under mask controlling. - for (int i = 0; i < vsp.length(); i += lsp.length()) { - IntVector vix = IntVector - .fromArray(lsp, indexMap, mapOffset + i) - .add(offset); - VectorIntrinsics.checkIndex(vix, a.length); + IntVector vix0 = IntVector.fromArray(lsp, indexMap, mapOffset).add(offset); + VectorIntrinsics.checkIndex(vix0, a.length); + + int vlen = vsp.length(); + int idx_vlen = lsp.length(); + IntVector vix1 = null; + if (vlen >= idx_vlen * 2) { + vix1 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen).add(offset); + VectorIntrinsics.checkIndex(vix1, a.length); + } + +#if[byte] + IntVector vix2 = null; + IntVector vix3 = null; + if (vlen == idx_vlen * 4) { + vix2 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 2).add(offset); + VectorIntrinsics.checkIndex(vix2, a.length); + vix3 = IntVector.fromArray(lsp, indexMap, mapOffset + idx_vlen * 3).add(offset); + VectorIntrinsics.checkIndex(vix3, a.length); } return VectorSupport.loadWithMap( vectorType, maskClass, $type$.class, vsp.laneCount(), - lsp.vectorType(), - a, ARRAY_BASE, null, m, + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, vix2, vix3, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); +#else[byte] + return VectorSupport.loadWithMap( + vectorType, maskClass, $type$.class, vsp.laneCount(), + lsp.vectorType(), lsp.length(), + a, ARRAY_BASE, vix0, vix1, null, null, m, + a, offset, indexMap, mapOffset, vsp, + (c, idx, iMap, idy, s, vm) -> + s.vOp(vm, n -> c[idx + iMap[idy+n]])); +#end[byte] } #else[byteOrShort] @ForceInline @@ -4995,8 +5041,8 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { return VectorSupport.loadWithMap( vectorType, maskClass, $type$.class, vsp.laneCount(), - isp.vectorType(), - a, ARRAY_BASE, vix, m, + isp.vectorType(), isp.length(), + a, ARRAY_BASE, vix, null, null, null, m, a, offset, indexMap, mapOffset, vsp, (c, idx, iMap, idy, s, vm) -> s.vOp(vm, n -> c[idx + iMap[idy+n]])); @@ -5186,7 +5232,7 @@ public abstract class $abstractvectortype$ extends AbstractVector<$Boxtype$> { VectorSupport.storeWithMap( vsp.vectorType(), maskClass, vsp.elementType(), vsp.laneCount(), - isp.vectorType(), + isp.vectorType(), isp.length(), a, arrayAddress(a, 0), vix, this, m, a, offset, indexMap, mapOffset, diff --git a/test/hotspot/gtest/aarch64/aarch64-asmtest.py b/test/hotspot/gtest/aarch64/aarch64-asmtest.py index 92868e783dcfe..64a209c36b00e 100644 --- a/test/hotspot/gtest/aarch64/aarch64-asmtest.py +++ b/test/hotspot/gtest/aarch64/aarch64-asmtest.py @@ -2063,6 +2063,8 @@ def generate(kind, names): ["index", "__ sve_index(z7, __ D, r5, 5);", "index\tz7.d, x5, #5"], ["cpy", "__ sve_cpy(z7, __ H, p3, r5);", "cpy\tz7.h, p3/m, w5"], ["tbl", "__ sve_tbl(z16, __ S, z17, z18);", "tbl\tz16.s, {z17.s}, z18.s"], + ["ld1b", "__ sve_ld1b_gather(z15, p0, r5, z16);", "ld1b\t{z15.s}, p0/z, [x5, z16.s, uxtw]"], + ["ld1h", "__ sve_ld1h_gather(z15, p0, r5, z16);", "ld1h\t{z15.s}, p0/z, [x5, z16.s, uxtw #1]"], ["ld1w", "__ sve_ld1w_gather(z15, p0, r5, z16);", "ld1w\t{z15.s}, p0/z, [x5, z16.s, uxtw #2]"], ["ld1d", "__ sve_ld1d_gather(z15, p0, r5, z16);", "ld1d\t{z15.d}, p0/z, [x5, z16.d, uxtw 
#3]"], ["st1w", "__ sve_st1w_scatter(z15, p0, r5, z16);", "st1w\t{z15.s}, p0, [x5, z16.s, uxtw #2]"], diff --git a/test/hotspot/gtest/aarch64/asmtest.out.h b/test/hotspot/gtest/aarch64/asmtest.out.h index 0c2011592b6f4..562d8d9adb7f1 100644 --- a/test/hotspot/gtest/aarch64/asmtest.out.h +++ b/test/hotspot/gtest/aarch64/asmtest.out.h @@ -1049,6 +1049,8 @@ __ sve_index(z7, __ D, r5, 5); // index z7.d, x5, #5 __ sve_cpy(z7, __ H, p3, r5); // cpy z7.h, p3/m, w5 __ sve_tbl(z16, __ S, z17, z18); // tbl z16.s, {z17.s}, z18.s + __ sve_ld1b_gather(z15, p0, r5, z16); // ld1b {z15.s}, p0/z, [x5, z16.s, uxtw] + __ sve_ld1h_gather(z15, p0, r5, z16); // ld1h {z15.s}, p0/z, [x5, z16.s, uxtw #1] __ sve_ld1w_gather(z15, p0, r5, z16); // ld1w {z15.s}, p0/z, [x5, z16.s, uxtw #2] __ sve_ld1d_gather(z15, p0, r5, z16); // ld1d {z15.d}, p0/z, [x5, z16.d, uxtw #3] __ sve_st1w_scatter(z15, p0, r5, z16); // st1w {z15.s}, p0, [x5, z16.s, uxtw #2] @@ -1387,30 +1389,30 @@ 0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061, 0x120cb166, 0x321764bc, 0x52174681, 0x720c0227, 0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01, - 0x14000000, 0x17ffffd7, 0x1400047d, 0x94000000, - 0x97ffffd4, 0x9400047a, 0x3400000a, 0x34fffa2a, - 0x34008eea, 0x35000008, 0x35fff9c8, 0x35008e88, - 0xb400000b, 0xb4fff96b, 0xb4008e2b, 0xb500001d, - 0xb5fff91d, 0xb5008ddd, 0x10000013, 0x10fff8b3, - 0x10008d73, 0x90000013, 0x36300016, 0x3637f836, - 0x36308cf6, 0x3758000c, 0x375ff7cc, 0x37588c8c, + 0x14000000, 0x17ffffd7, 0x1400047f, 0x94000000, + 0x97ffffd4, 0x9400047c, 0x3400000a, 0x34fffa2a, + 0x34008f2a, 0x35000008, 0x35fff9c8, 0x35008ec8, + 0xb400000b, 0xb4fff96b, 0xb4008e6b, 0xb500001d, + 0xb5fff91d, 0xb5008e1d, 0x10000013, 0x10fff8b3, + 0x10008db3, 0x90000013, 0x36300016, 0x3637f836, + 0x36308d36, 0x3758000c, 0x375ff7cc, 0x37588ccc, 0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc, 0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f, 0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016, 0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0, - 0x54008a60, 0x54000001, 0x54fff541, 0x54008a01, - 0x54000002, 0x54fff4e2, 0x540089a2, 0x54000002, - 0x54fff482, 0x54008942, 0x54000003, 0x54fff423, - 0x540088e3, 0x54000003, 0x54fff3c3, 0x54008883, - 0x54000004, 0x54fff364, 0x54008824, 0x54000005, - 0x54fff305, 0x540087c5, 0x54000006, 0x54fff2a6, - 0x54008766, 0x54000007, 0x54fff247, 0x54008707, - 0x54000008, 0x54fff1e8, 0x540086a8, 0x54000009, - 0x54fff189, 0x54008649, 0x5400000a, 0x54fff12a, - 0x540085ea, 0x5400000b, 0x54fff0cb, 0x5400858b, - 0x5400000c, 0x54fff06c, 0x5400852c, 0x5400000d, - 0x54fff00d, 0x540084cd, 0x5400000e, 0x54ffefae, - 0x5400846e, 0x5400000f, 0x54ffef4f, 0x5400840f, + 0x54008aa0, 0x54000001, 0x54fff541, 0x54008a41, + 0x54000002, 0x54fff4e2, 0x540089e2, 0x54000002, + 0x54fff482, 0x54008982, 0x54000003, 0x54fff423, + 0x54008923, 0x54000003, 0x54fff3c3, 0x540088c3, + 0x54000004, 0x54fff364, 0x54008864, 0x54000005, + 0x54fff305, 0x54008805, 0x54000006, 0x54fff2a6, + 0x540087a6, 0x54000007, 0x54fff247, 0x54008747, + 0x54000008, 0x54fff1e8, 0x540086e8, 0x54000009, + 0x54fff189, 0x54008689, 0x5400000a, 0x54fff12a, + 0x5400862a, 0x5400000b, 0x54fff0cb, 0x540085cb, + 0x5400000c, 0x54fff06c, 0x5400856c, 0x5400000d, + 0x54fff00d, 0x5400850d, 0x5400000e, 0x54ffefae, + 0x540084ae, 0x5400000f, 0x54ffef4f, 0x5400844f, 0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60, 0xd44cad80, 0xd503201f, 0xd503203f, 0xd503205f, 0xd503209f, 0xd50320bf, 0xd503219f, 0xd50323bf, @@ -1604,76 +1606,77 @@ 0x659ca509, 0x65d8a801, 0x65dcac01, 0x655cb241, 0x0520a1e0, 0x0521a601, 0x052281e0, 
0x05238601, 0x04a14026, 0x042244a6, 0x046344a6, 0x04a444a6, - 0x04e544a7, 0x0568aca7, 0x05b23230, 0x853040af, - 0xc5b040af, 0xe57080af, 0xe5b080af, 0x25034440, - 0x254054c4, 0x25034640, 0x25415a05, 0x25834440, - 0x25c54489, 0x250b5d3a, 0x2550dc20, 0x2518e3e1, - 0x2518e021, 0x2518e0a1, 0x2518e121, 0x2518e1a1, - 0x2558e3e2, 0x2558e042, 0x2558e0c2, 0x2558e142, - 0x2598e3e3, 0x2598e063, 0x2598e0e3, 0x2598e163, - 0x25d8e3e4, 0x25d8e084, 0x25d8e104, 0x25d8e184, - 0x2518e407, 0x05214800, 0x05614800, 0x05a14800, - 0x05e14800, 0x05214c00, 0x05614c00, 0x05a14c00, - 0x05e14c00, 0x05304001, 0x05314001, 0x05a18610, - 0x05e18610, 0x05271e11, 0x6545e891, 0x6585e891, - 0x65c5e891, 0x6545c891, 0x6585c891, 0x65c5c891, - 0x45b0c210, 0x45f1c231, 0x1e601000, 0x1e603000, - 0x1e621000, 0x1e623000, 0x1e641000, 0x1e643000, - 0x1e661000, 0x1e663000, 0x1e681000, 0x1e683000, - 0x1e6a1000, 0x1e6a3000, 0x1e6c1000, 0x1e6c3000, - 0x1e6e1000, 0x1e6e3000, 0x1e701000, 0x1e703000, - 0x1e721000, 0x1e723000, 0x1e741000, 0x1e743000, - 0x1e761000, 0x1e763000, 0x1e781000, 0x1e783000, - 0x1e7a1000, 0x1e7a3000, 0x1e7c1000, 0x1e7c3000, - 0x1e7e1000, 0x1e7e3000, 0xf82081f1, 0xf824011a, - 0xf83c1376, 0xf83b22f9, 0xf82030c4, 0xf8305080, - 0xf82f4141, 0xf8277145, 0xf83c6287, 0xf8b780d5, - 0xf8ab0228, 0xf8bf1226, 0xf8a223cc, 0xf8bd3363, - 0xf8b651dd, 0xf8ad423c, 0xf8b87045, 0xf8ae620a, - 0xf8eb82fb, 0xf8ec02c4, 0xf8f11024, 0xf8f321f0, - 0xf8ed318e, 0xf8e25071, 0xf8f540b7, 0xf8e67267, - 0xf8ed623c, 0xf8708046, 0xf87d0083, 0xf8661290, - 0xf86d228c, 0xf8683299, 0xf8735160, 0xf8784286, - 0xf87f720e, 0xf86660e0, 0xb82f8353, 0xb82902ea, - 0xb8351396, 0xb82221e3, 0xb83330f4, 0xb82450fd, - 0xb8204209, 0xb8347097, 0xb83062ea, 0xb8ab80d9, - 0xb8bf01b0, 0xb8b7102c, 0xb8ae22a9, 0xb8b031fa, - 0xb8a451e4, 0xb8a843c6, 0xb8a4723d, 0xb8bd613a, - 0xb8ef8162, 0xb8fd00e3, 0xb8e112bb, 0xb8f0210e, - 0xb8f03336, 0xb8e552b4, 0xb8f04217, 0xb8fe7294, - 0xb8e06264, 0xb8788284, 0xb8640358, 0xb8731102, - 0xb868230e, 0xb87032df, 0xb864503f, 0xb86a4194, - 0xb86070e9, 0xb8786090, 0xce2a6cdb, 0xce107db8, - 0xce748ed6, 0xce8973bf, 0xce7480f4, 0xce6b853c, - 0xcec0818e, 0xce788834, 0x25a0cd89, 0x25a1d093, - 0x05803685, 0x05400c08, 0x050074c4, 0x2560d6a0, - 0x2521c0fb, 0x05805089, 0x05403e98, 0x05025238, - 0x25e0cd0b, 0x25e1d1d2, 0x05800e4e, 0x05402676, - 0x05001e63, 0x25a0d1c9, 0x2521c495, 0x0583abe2, - 0x054011ab, 0x05007cbe, 0x2560c3b7, 0x25e1c358, - 0x05806593, 0x054064b5, 0x05000e5a, 0x2520c3f1, - 0x25a1cc29, 0x05801468, 0x05401d71, 0x05035bb2, - 0x04bb01f0, 0x046806dc, 0x659c0385, 0x65d909e0, - 0x65c30415, 0x04fa10ba, 0x04611a33, 0x042e17ce, - 0x04bf1c52, 0x0456b7d7, 0x04400008, 0x049a1417, - 0x04509b1a, 0x041b1456, 0x0499b58b, 0x04dab938, - 0x04991691, 0x04d395a4, 0x04d19ff6, 0x045011f2, - 0x0417be8d, 0x041eadc1, 0x04980987, 0x052799e4, - 0x05a49c23, 0x04c817e5, 0x044a0d2d, 0x04c901fe, - 0x044b0343, 0x04c10839, 0x04dcac2a, 0x65c087ba, - 0x658d8791, 0x65869d61, 0x65c78021, 0x65828c5b, - 0x049db33e, 0x65c2b862, 0x65c0ac7d, 0x65c1b38e, - 0x65cdab64, 0x65c19022, 0x65fc97e7, 0x65bd162a, - 0x65b82596, 0x65a0a969, 0x65a4d697, 0x65feec8f, - 0x65ba46bb, 0x65a4633f, 0x04c742a6, 0x049f7f18, - 0x042c3141, 0x04b9310d, 0x047733e1, 0x04f53014, - 0x05bb6bbf, 0x05ba6fa8, 0x65c88645, 0x4555b34d, - 0x45cab660, 0x043138c7, 0x44589b94, 0x445a8e71, - 0x44198b1a, 0x449b8f8b, 0x049a3797, 0x04183f14, - 0x045926fb, 0x04c825ac, 0x040a369a, 0x65873fa2, - 0x6586347d, 0x65982b85, 0x04412dd1, 0x0e2c116a, - 0x4e2a1128, 0x0e6b1149, 0x4e751293, 0x0ea21020, - 0x4ebf13dd, 0x2e321230, 0x6e321230, 
0x2e6f11cd, - 0x6e791317, 0x2eba1338, 0x6eb91317, + 0x04e544a7, 0x0568aca7, 0x05b23230, 0x841040af, + 0x84b040af, 0x853040af, 0xc5b040af, 0xe57080af, + 0xe5b080af, 0x25034440, 0x254054c4, 0x25034640, + 0x25415a05, 0x25834440, 0x25c54489, 0x250b5d3a, + 0x2550dc20, 0x2518e3e1, 0x2518e021, 0x2518e0a1, + 0x2518e121, 0x2518e1a1, 0x2558e3e2, 0x2558e042, + 0x2558e0c2, 0x2558e142, 0x2598e3e3, 0x2598e063, + 0x2598e0e3, 0x2598e163, 0x25d8e3e4, 0x25d8e084, + 0x25d8e104, 0x25d8e184, 0x2518e407, 0x05214800, + 0x05614800, 0x05a14800, 0x05e14800, 0x05214c00, + 0x05614c00, 0x05a14c00, 0x05e14c00, 0x05304001, + 0x05314001, 0x05a18610, 0x05e18610, 0x05271e11, + 0x6545e891, 0x6585e891, 0x65c5e891, 0x6545c891, + 0x6585c891, 0x65c5c891, 0x45b0c210, 0x45f1c231, + 0x1e601000, 0x1e603000, 0x1e621000, 0x1e623000, + 0x1e641000, 0x1e643000, 0x1e661000, 0x1e663000, + 0x1e681000, 0x1e683000, 0x1e6a1000, 0x1e6a3000, + 0x1e6c1000, 0x1e6c3000, 0x1e6e1000, 0x1e6e3000, + 0x1e701000, 0x1e703000, 0x1e721000, 0x1e723000, + 0x1e741000, 0x1e743000, 0x1e761000, 0x1e763000, + 0x1e781000, 0x1e783000, 0x1e7a1000, 0x1e7a3000, + 0x1e7c1000, 0x1e7c3000, 0x1e7e1000, 0x1e7e3000, + 0xf82081f1, 0xf824011a, 0xf83c1376, 0xf83b22f9, + 0xf82030c4, 0xf8305080, 0xf82f4141, 0xf8277145, + 0xf83c6287, 0xf8b780d5, 0xf8ab0228, 0xf8bf1226, + 0xf8a223cc, 0xf8bd3363, 0xf8b651dd, 0xf8ad423c, + 0xf8b87045, 0xf8ae620a, 0xf8eb82fb, 0xf8ec02c4, + 0xf8f11024, 0xf8f321f0, 0xf8ed318e, 0xf8e25071, + 0xf8f540b7, 0xf8e67267, 0xf8ed623c, 0xf8708046, + 0xf87d0083, 0xf8661290, 0xf86d228c, 0xf8683299, + 0xf8735160, 0xf8784286, 0xf87f720e, 0xf86660e0, + 0xb82f8353, 0xb82902ea, 0xb8351396, 0xb82221e3, + 0xb83330f4, 0xb82450fd, 0xb8204209, 0xb8347097, + 0xb83062ea, 0xb8ab80d9, 0xb8bf01b0, 0xb8b7102c, + 0xb8ae22a9, 0xb8b031fa, 0xb8a451e4, 0xb8a843c6, + 0xb8a4723d, 0xb8bd613a, 0xb8ef8162, 0xb8fd00e3, + 0xb8e112bb, 0xb8f0210e, 0xb8f03336, 0xb8e552b4, + 0xb8f04217, 0xb8fe7294, 0xb8e06264, 0xb8788284, + 0xb8640358, 0xb8731102, 0xb868230e, 0xb87032df, + 0xb864503f, 0xb86a4194, 0xb86070e9, 0xb8786090, + 0xce2a6cdb, 0xce107db8, 0xce748ed6, 0xce8973bf, + 0xce7480f4, 0xce6b853c, 0xcec0818e, 0xce788834, + 0x25a0cd89, 0x25a1d093, 0x05803685, 0x05400c08, + 0x050074c4, 0x2560d6a0, 0x2521c0fb, 0x05805089, + 0x05403e98, 0x05025238, 0x25e0cd0b, 0x25e1d1d2, + 0x05800e4e, 0x05402676, 0x05001e63, 0x25a0d1c9, + 0x2521c495, 0x0583abe2, 0x054011ab, 0x05007cbe, + 0x2560c3b7, 0x25e1c358, 0x05806593, 0x054064b5, + 0x05000e5a, 0x2520c3f1, 0x25a1cc29, 0x05801468, + 0x05401d71, 0x05035bb2, 0x04bb01f0, 0x046806dc, + 0x659c0385, 0x65d909e0, 0x65c30415, 0x04fa10ba, + 0x04611a33, 0x042e17ce, 0x04bf1c52, 0x0456b7d7, + 0x04400008, 0x049a1417, 0x04509b1a, 0x041b1456, + 0x0499b58b, 0x04dab938, 0x04991691, 0x04d395a4, + 0x04d19ff6, 0x045011f2, 0x0417be8d, 0x041eadc1, + 0x04980987, 0x052799e4, 0x05a49c23, 0x04c817e5, + 0x044a0d2d, 0x04c901fe, 0x044b0343, 0x04c10839, + 0x04dcac2a, 0x65c087ba, 0x658d8791, 0x65869d61, + 0x65c78021, 0x65828c5b, 0x049db33e, 0x65c2b862, + 0x65c0ac7d, 0x65c1b38e, 0x65cdab64, 0x65c19022, + 0x65fc97e7, 0x65bd162a, 0x65b82596, 0x65a0a969, + 0x65a4d697, 0x65feec8f, 0x65ba46bb, 0x65a4633f, + 0x04c742a6, 0x049f7f18, 0x042c3141, 0x04b9310d, + 0x047733e1, 0x04f53014, 0x05bb6bbf, 0x05ba6fa8, + 0x65c88645, 0x4555b34d, 0x45cab660, 0x043138c7, + 0x44589b94, 0x445a8e71, 0x44198b1a, 0x449b8f8b, + 0x049a3797, 0x04183f14, 0x045926fb, 0x04c825ac, + 0x040a369a, 0x65873fa2, 0x6586347d, 0x65982b85, + 0x04412dd1, 0x0e2c116a, 0x4e2a1128, 0x0e6b1149, + 0x4e751293, 0x0ea21020, 0x4ebf13dd, 0x2e321230, + 
0x6e321230, 0x2e6f11cd, 0x6e791317, 0x2eba1338, + 0x6eb91317, }; // END Generated code -- do not edit diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorGatherSubwordTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorGatherSubwordTest.java new file mode 100644 index 0000000000000..63a35db50a5a3 --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorGatherSubwordTest.java @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.vectorapi; + +import compiler.lib.generators.*; +import compiler.lib.ir_framework.*; +import jdk.incubator.vector.*; +import jdk.test.lib.Asserts; + +/** + * @test + * @bug 8351623 + * @summary VectorAPI: Refactor subword gather load and add SVE implementation + * @key randomness + * @library /test/lib / + * @modules jdk.incubator.vector + * + * @run driver compiler.vectorapi.VectorGatherSubwordTest + */ +public class VectorGatherSubwordTest { + private static final VectorSpecies B_SPECIES = ByteVector.SPECIES_MAX; + private static final VectorSpecies S_SPECIES = ShortVector.SPECIES_MAX; + + private static int LENGTH = 128; + private static final Generators random = Generators.G; + + private static byte[] ba; + private static byte[] br; + private static short[] sa; + private static short[] sr; + private static boolean[] m; + private static int[][] indexes; + + static { + ba = new byte[LENGTH]; + br = new byte[LENGTH]; + sa = new short[LENGTH]; + sr = new short[LENGTH]; + m = new boolean[LENGTH]; + indexes = new int[2][]; + + Generator byteGen = random.uniformInts(Byte.MIN_VALUE, Byte.MAX_VALUE); + Generator shortGen = random.uniformInts(Short.MIN_VALUE, Short.MAX_VALUE); + for (int i = 0; i < LENGTH; i++) { + ba[i] = byteGen.next().byteValue(); + sa[i] = shortGen.next().shortValue(); + m[i] = i % 2 == 0; + } + + int[] nums = {B_SPECIES.length(), S_SPECIES.length()}; + for (int i = 0; i < 2; i++) { + indexes[i] = new int[nums[i]]; + random.fill(random.uniformInts(0, nums[i] - 1), indexes[i]); + } + } + + @Test + @IR(counts = { IRNode.LOAD_VECTOR_GATHER, " >0 "}, applyIfCPUFeature = {"sve", "true"}) + public void testLoadGatherByte() { + for (int i = 0; i < LENGTH; i += B_SPECIES.length()) { + ByteVector.fromArray(B_SPECIES, ba, i, indexes[0], 0) + .intoArray(br, i); + } + } + + @Check(test = "testLoadGatherByte") + public void verifyLoadGatherByte() { + for (int i = 0; i < LENGTH; i += B_SPECIES.length()) { + for (int j = 0; j < B_SPECIES.length(); j++) { + Asserts.assertEquals(ba[i + indexes[0][j]], br[i + j]); 
+ } + } + } + + @Test + @IR(counts = { IRNode.LOAD_VECTOR_GATHER, " >0 "}, applyIfCPUFeature = {"sve", "true"}) + public void testLoadGatherShort() { + for (int i = 0; i < LENGTH; i += S_SPECIES.length()) { + ShortVector.fromArray(S_SPECIES, sa, i, indexes[1], 0) + .intoArray(sr, i); + } + } + + @Check(test = "testLoadGatherShort") + public void verifyLoadGatherShort() { + for (int i = 0; i < LENGTH; i += S_SPECIES.length()) { + for (int j = 0; j < S_SPECIES.length(); j++) { + Asserts.assertEquals(sa[i + indexes[1][j]], sr[i + j]); + } + } + } + + @Test + @IR(counts = { IRNode.LOAD_VECTOR_GATHER_MASKED, " >0 "}, applyIfCPUFeature = {"sve", "true"}) + public void testLoadGatherMaskedByte() { + VectorMask mask = VectorMask.fromArray(B_SPECIES, m, 0); + for (int i = 0; i < LENGTH; i += B_SPECIES.length()) { + ByteVector.fromArray(B_SPECIES, ba, i, indexes[0], 0, mask) + .intoArray(br, i); + } + } + + @Check(test = "testLoadGatherMaskedByte") + public void verifyLoadGatherMaskedByte() { + for (int i = 0; i < LENGTH; i += B_SPECIES.length()) { + for (int j = 0; j < B_SPECIES.length(); j++) { + Asserts.assertEquals(m[j] ? ba[i + indexes[0][j]] : 0, br[i + j]); + } + } + } + + @Test + @IR(counts = { IRNode.LOAD_VECTOR_GATHER_MASKED, " >0 "}, applyIfCPUFeature = {"sve", "true"}) + public void testLoadGatherMaskedShort() { + VectorMask mask = VectorMask.fromArray(S_SPECIES, m, 0); + for (int i = 0; i < LENGTH; i += S_SPECIES.length()) { + ShortVector.fromArray(S_SPECIES, sa, i, indexes[1], 0, mask) + .intoArray(sr, i); + } + } + + @Check(test = "testLoadGatherMaskedShort") + public void verifyLoadGatherMaskedShort() { + for (int i = 0; i < LENGTH; i += S_SPECIES.length()) { + for (int j = 0; j < S_SPECIES.length(); j++) { + Asserts.assertEquals(m[j] ? sa[i + indexes[1][j]] : 0, sr[i + j]); + } + } + } + + public static void main(String[] args) { + TestFramework testFramework = new TestFramework(); + testFramework.setDefaultWarmup(5000) + .addFlags("--add-modules=jdk.incubator.vector") + .start(); + } +}