openjdk · mikabl-arm · Oct 18, 2024 · Jan 30, 2025 · Jan 30, 2025 · Feb 20, 2025
diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad
@@ -127,6 +127,17 @@ source %{
   }
 
   bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
+    // Do not auto-vectorize these FP operations, neither NEON nor SVE/SVE2 supports them directly:
+    //   1. The non_strict_order SVE implementation for 256-bit wide vectors does recursive folding
+    //      and doesn't conform to the JLS, Section Evaluation Order.
+    //   2. A strictly ordered SVE implementation for 256-bit wide vectors isn't currently
+    //      profitable performance-wise.
+    //   3. The strictly ordered NEON implementation for 64-bit and 128-bit wide vectors isn't
+    //      profitable performance-wise.
+    if (opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
+      return false;
+    }
+
     if (UseSVE == 0) {
       // These operations are not profitable to be vectorized on NEON, because no direct
       // NEON instructions support them. But the match rule support for them is profitable for
@@ -139,7 +150,6 @@ source %{
           // They are not suitable for auto-vectorization because the result would not conform
           // to the JLS, Section Evaluation Order.
           opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
-          opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
           opcode == Op_MulVL) {
         return false;
       }
@@ -205,9 +215,9 @@ source %{
       case Op_MulReductionVF:
       case Op_MulReductionVI:
       case Op_MulReductionVL:
-        // No vector multiply reduction instructions, but we do
-        // emit scalar instructions for 64/128-bit vectors.
-        if (length_in_bytes != 8 && length_in_bytes != 16) {
+        // No vector multiply reduction instructions, but we do emit ASIMD instructions for
+        // 64/128-bit vectors. For 256-bit vectors it's a combination of SVE and ASIMD instructions.
+        if (length_in_bytes < 8 || length_in_bytes > 32) {
           return false;
         }
         break;
@@ -3482,56 +3492,122 @@ instruct reduce_addD_masked(vRegD dst_src1, vReg src2, pRegGov pg) %{
 
 // ------------------------------ Vector reduction mul -------------------------
 
-instruct reduce_mulI(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
-                     vReg tmp1, vReg tmp2) %{
+instruct reduce_mulI_le128b(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
+                            vReg tmp1, vReg tmp2) %{
   predicate(Matcher::vector_length_in_bytes(n->in(2)) == 8 ||
             Matcher::vector_length_in_bytes(n->in(2)) == 16);
   match(Set dst (MulReductionVI isrc vsrc));
   effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
-  format %{ "reduce_mulI $dst, $isrc, $vsrc\t# vector (64/128 bits). KILL $tmp1, $tmp2" %}
+  format %{ "reduce_mulI_le128b $dst, $isrc, $vsrc\t# vector (64/128 bits). KILL $tmp1, $tmp2" %}
+  ins_encode %{
+    BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
+    __ reduce_mul_integral_le128b($dst$$Register, bt, $isrc$$Register,
+                                  $vsrc$$FloatRegister, length_in_bytes,
+                                  $tmp1$$FloatRegister, $tmp2$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_mulI_256b(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
+                          vReg tmp1, vReg tmp2, vReg tmp3) %{
+  predicate(Matcher::vector_length_in_bytes(n->in(2)) == 32);
+  match(Set dst (MulReductionVI isrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3);
+  format %{ "reduce_mulI_256b $dst, $isrc, $vsrc\t# vector (256 bits). KILL $tmp1, $tmp2, $tmp3" %}
   ins_encode %{
+    assert(UseSVE > 0, "must be sve");
     BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
     uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
-    __ neon_reduce_mul_integral($dst$$Register, bt, $isrc$$Register,
+    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
+    __ reduce_mul_integral_256b($dst$$Register, bt, $isrc$$Register,
                                 $vsrc$$FloatRegister, length_in_bytes,
-                                $tmp1$$FloatRegister, $tmp2$$FloatRegister);
+                                $tmp1$$FloatRegister, $tmp2$$FloatRegister, $tmp3$$FloatRegister);
   %}
   ins_pipe(pipe_slow);
 %}
 
-instruct reduce_mulL(iRegLNoSp dst, iRegL isrc, vReg vsrc) %{
+instruct reduce_mulL_128b(iRegLNoSp dst, iRegL isrc, vReg vsrc) %{
   predicate(Matcher::vector_length_in_bytes(n->in(2)) == 16);
   match(Set dst (MulReductionVL isrc vsrc));
   effect(TEMP_DEF dst);
-  format %{ "reduce_mulL $dst, $isrc, $vsrc\t# 2L" %}
+  format %{ "reduce_mulL_128b $dst, $isrc, $vsrc\t# 2L" %}
   ins_encode %{
-    __ neon_reduce_mul_integral($dst$$Register, T_LONG, $isrc$$Register,
-                                $vsrc$$FloatRegister, 16, fnoreg, fnoreg);
+    __ reduce_mul_integral_le128b($dst$$Register, T_LONG, $isrc$$Register, $vsrc$$FloatRegister, 16,
+                                  fnoreg, fnoreg);
   %}
   ins_pipe(pipe_slow);
 %}
 
-instruct reduce_mulF(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
+instruct reduce_mulL_256b(iRegLNoSp dst, iRegL isrc, vReg vsrc, vReg tmp1) %{
+  predicate(Matcher::vector_length_in_bytes(n->in(2)) == 32);
+  match(Set dst (MulReductionVL isrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp1);
+  format %{ "reduce_mulL_256b $dst, $isrc, $vsrc\t# 4L. KILL $tmp1" %}
+  ins_encode %{
+    assert(UseSVE > 0, "must be sve");
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
+    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
+    __ reduce_mul_integral_256b($dst$$Register, T_LONG, $isrc$$Register,
+                                $vsrc$$FloatRegister, length_in_bytes,
+                                $tmp1$$FloatRegister, fnoreg, fnoreg);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_mulF_le128b(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
   predicate(Matcher::vector_length_in_bytes(n->in(2)) <= 16);
   match(Set dst (MulReductionVF fsrc vsrc));
   effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "reduce_mulF $dst, $fsrc, $vsrc\t# 2F/4F. KILL $tmp" %}
+  format %{ "reduce_mulF_le128b $dst, $fsrc, $vsrc\t# 2F/4F. KILL $tmp" %}
   ins_encode %{
     uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
-    __ neon_reduce_mul_fp($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister,
-                          $vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
+    __ reduce_mul_fp_le128b($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister,
+                            $vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_non_strict_order_mulF_256b(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp1, vReg tmp2) %{
+  predicate(Matcher::vector_length_in_bytes(n->in(2)) == 32 && !n->as_Reduction()->requires_strict_order());
+  match(Set dst (MulReductionVF fsrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
+  format %{ "reduce_non_strict_order_mulF_256b $dst, $fsrc, $vsrc\t# 8F. KILL $tmp1, $tmp2" %}
+  ins_encode %{
+    assert(UseSVE > 0, "must be sve");
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
+    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
+    __ reduce_non_strict_order_mul_fp_256b($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister,
+                                           $vsrc$$FloatRegister, length_in_bytes, $tmp1$$FloatRegister,
+                                           $tmp2$$FloatRegister);
   %}
   ins_pipe(pipe_slow);
 %}
 
-instruct reduce_mulD(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{
+instruct reduce_mulD_128b(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{
   predicate(Matcher::vector_length_in_bytes(n->in(2)) == 16);
   match(Set dst (MulReductionVD dsrc vsrc));
   effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "reduce_mulD $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %}
+  format %{ "reduce_mulD_128b $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %}
   ins_encode %{
-    __ neon_reduce_mul_fp($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister,
-                          $vsrc$$FloatRegister, 16, $tmp$$FloatRegister);
+    __ reduce_mul_fp_le128b($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister,
+                            $vsrc$$FloatRegister, 16, $tmp$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_non_strict_order_mulD_256b(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp1, vReg tmp2) %{
+  predicate(Matcher::vector_length_in_bytes(n->in(2)) == 32 && !n->as_Reduction()->requires_strict_order());
+  match(Set dst (MulReductionVD dsrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
+  format %{ "reduce_non_strict_order_mulD_256b $dst, $dsrc, $vsrc\t# 4D. KILL $tmp1, $tmp2" %}
+  ins_encode %{
+    assert(UseSVE > 0, "must be sve");
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
+    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
+    __ reduce_non_strict_order_mul_fp_256b($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister,
+                                           $vsrc$$FloatRegister, length_in_bytes, $tmp1$$FloatRegister,
+                                           $tmp2$$FloatRegister);
   %}
   ins_pipe(pipe_slow);
 %}

diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@@ -117,6 +117,17 @@ source %{
   }
 
   bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) {
+    // Do not auto-vectorize these FP operations, neither NEON nor SVE/SVE2 supports them directly:
+    //   1. The non_strict_order SVE implementation for 256-bit wide vectors does recursive folding
+    //      and doesn't conform to the JLS, Section Evaluation Order.
+    //   2. A strictly ordered SVE implementation for 256-bit wide vectors isn't currently
+    //      profitable performance-wise.
+    //   3. The strictly ordered NEON implementation for 64-bit and 128-bit wide vectors isn't
+    //      profitable performance-wise.
+    if (opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) {
+      return false;
+    }
+
     if (UseSVE == 0) {
       // These operations are not profitable to be vectorized on NEON, because no direct
       // NEON instructions support them. But the match rule support for them is profitable for
@@ -129,7 +140,6 @@ source %{
           // They are not suitable for auto-vectorization because the result would not conform
           // to the JLS, Section Evaluation Order.
           opcode == Op_AddReductionVD || opcode == Op_AddReductionVF ||
-          opcode == Op_MulReductionVD || opcode == Op_MulReductionVF ||
           opcode == Op_MulVL) {
         return false;
       }
@@ -195,9 +205,9 @@ source %{
       case Op_MulReductionVF:
       case Op_MulReductionVI:
       case Op_MulReductionVL:
-        // No vector multiply reduction instructions, but we do
-        // emit scalar instructions for 64/128-bit vectors.
-        if (length_in_bytes != 8 && length_in_bytes != 16) {
+        // No vector multiply reduction instructions, but we do emit ASIMD instructions for
+        // 64/128-bit vectors. For 256-bit vectors it's a combination of SVE and ASIMD instructions.
+        if (length_in_bytes < 8 || length_in_bytes > 32) {
           return false;
         }
         break;
@@ -2109,56 +2119,122 @@ REDUCE_ADD_FP_PREDICATE(D, D)
 
 // ------------------------------ Vector reduction mul -------------------------
 
-instruct reduce_mulI(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
-                     vReg tmp1, vReg tmp2) %{
+instruct reduce_mulI_le128b(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
+                            vReg tmp1, vReg tmp2) %{
   predicate(Matcher::vector_length_in_bytes(n->in(2)) == 8 ||
             Matcher::vector_length_in_bytes(n->in(2)) == 16);
   match(Set dst (MulReductionVI isrc vsrc));
   effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
-  format %{ "reduce_mulI $dst, $isrc, $vsrc\t# vector (64/128 bits). KILL $tmp1, $tmp2" %}
+  format %{ "reduce_mulI_le128b $dst, $isrc, $vsrc\t# vector (64/128 bits). KILL $tmp1, $tmp2" %}
+  ins_encode %{
+    BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
+    __ reduce_mul_integral_le128b($dst$$Register, bt, $isrc$$Register,
+                                  $vsrc$$FloatRegister, length_in_bytes,
+                                  $tmp1$$FloatRegister, $tmp2$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_mulI_256b(iRegINoSp dst, iRegIorL2I isrc, vReg vsrc,
+                          vReg tmp1, vReg tmp2, vReg tmp3) %{
+  predicate(Matcher::vector_length_in_bytes(n->in(2)) == 32);
+  match(Set dst (MulReductionVI isrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2, TEMP tmp3);
+  format %{ "reduce_mulI_256b $dst, $isrc, $vsrc\t# vector (256 bits). KILL $tmp1, $tmp2, $tmp3" %}
   ins_encode %{
+    assert(UseSVE > 0, "must be sve");
     BasicType bt = Matcher::vector_element_basic_type(this, $vsrc);
     uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
-    __ neon_reduce_mul_integral($dst$$Register, bt, $isrc$$Register,
+    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
+    __ reduce_mul_integral_256b($dst$$Register, bt, $isrc$$Register,
                                 $vsrc$$FloatRegister, length_in_bytes,
-                                $tmp1$$FloatRegister, $tmp2$$FloatRegister);
+                                $tmp1$$FloatRegister, $tmp2$$FloatRegister, $tmp3$$FloatRegister);
   %}
   ins_pipe(pipe_slow);
 %}
 
-instruct reduce_mulL(iRegLNoSp dst, iRegL isrc, vReg vsrc) %{
+instruct reduce_mulL_128b(iRegLNoSp dst, iRegL isrc, vReg vsrc) %{
   predicate(Matcher::vector_length_in_bytes(n->in(2)) == 16);
   match(Set dst (MulReductionVL isrc vsrc));
   effect(TEMP_DEF dst);
-  format %{ "reduce_mulL $dst, $isrc, $vsrc\t# 2L" %}
+  format %{ "reduce_mulL_128b $dst, $isrc, $vsrc\t# 2L" %}
   ins_encode %{
-    __ neon_reduce_mul_integral($dst$$Register, T_LONG, $isrc$$Register,
-                                $vsrc$$FloatRegister, 16, fnoreg, fnoreg);
+    __ reduce_mul_integral_le128b($dst$$Register, T_LONG, $isrc$$Register, $vsrc$$FloatRegister, 16,
+                                  fnoreg, fnoreg);
   %}
   ins_pipe(pipe_slow);
 %}
 
-instruct reduce_mulF(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
+instruct reduce_mulL_256b(iRegLNoSp dst, iRegL isrc, vReg vsrc, vReg tmp1) %{
+  predicate(Matcher::vector_length_in_bytes(n->in(2)) == 32);
+  match(Set dst (MulReductionVL isrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp1);
+  format %{ "reduce_mulL_256b $dst, $isrc, $vsrc\t# 4L. KILL $tmp1" %}
+  ins_encode %{
+    assert(UseSVE > 0, "must be sve");
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
+    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
+    __ reduce_mul_integral_256b($dst$$Register, T_LONG, $isrc$$Register,
+                                $vsrc$$FloatRegister, length_in_bytes,
+                                $tmp1$$FloatRegister, fnoreg, fnoreg);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_mulF_le128b(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp) %{
   predicate(Matcher::vector_length_in_bytes(n->in(2)) <= 16);
   match(Set dst (MulReductionVF fsrc vsrc));
   effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "reduce_mulF $dst, $fsrc, $vsrc\t# 2F/4F. KILL $tmp" %}
+  format %{ "reduce_mulF_le128b $dst, $fsrc, $vsrc\t# 2F/4F. KILL $tmp" %}
   ins_encode %{
     uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
-    __ neon_reduce_mul_fp($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister,
-                          $vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
+    __ reduce_mul_fp_le128b($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister,
+                            $vsrc$$FloatRegister, length_in_bytes, $tmp$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_non_strict_order_mulF_256b(vRegF dst, vRegF fsrc, vReg vsrc, vReg tmp1, vReg tmp2) %{
+  predicate(Matcher::vector_length_in_bytes(n->in(2)) == 32 && !n->as_Reduction()->requires_strict_order());
+  match(Set dst (MulReductionVF fsrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
+  format %{ "reduce_non_strict_order_mulF_256b $dst, $fsrc, $vsrc\t# 8F. KILL $tmp1, $tmp2" %}
+  ins_encode %{
+    assert(UseSVE > 0, "must be sve");
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
+    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
+    __ reduce_non_strict_order_mul_fp_256b($dst$$FloatRegister, T_FLOAT, $fsrc$$FloatRegister,
+                                           $vsrc$$FloatRegister, length_in_bytes, $tmp1$$FloatRegister,
+                                           $tmp2$$FloatRegister);
   %}
   ins_pipe(pipe_slow);
 %}
 
-instruct reduce_mulD(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{
+instruct reduce_mulD_128b(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp) %{
   predicate(Matcher::vector_length_in_bytes(n->in(2)) == 16);
   match(Set dst (MulReductionVD dsrc vsrc));
   effect(TEMP_DEF dst, TEMP tmp);
-  format %{ "reduce_mulD $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %}
+  format %{ "reduce_mulD_128b $dst, $dsrc, $vsrc\t# 2D. KILL $tmp" %}
   ins_encode %{
-    __ neon_reduce_mul_fp($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister,
-                          $vsrc$$FloatRegister, 16, $tmp$$FloatRegister);
+    __ reduce_mul_fp_le128b($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister,
+                            $vsrc$$FloatRegister, 16, $tmp$$FloatRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct reduce_non_strict_order_mulD_256b(vRegD dst, vRegD dsrc, vReg vsrc, vReg tmp1, vReg tmp2) %{
+  predicate(Matcher::vector_length_in_bytes(n->in(2)) == 32 && !n->as_Reduction()->requires_strict_order());
+  match(Set dst (MulReductionVD dsrc vsrc));
+  effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
+  format %{ "reduce_non_strict_order_mulD_256b $dst, $dsrc, $vsrc\t# 4D. KILL $tmp1, $tmp2" %}
+  ins_encode %{
+    assert(UseSVE > 0, "must be sve");
+    uint length_in_bytes = Matcher::vector_length_in_bytes(this, $vsrc);
+    assert(length_in_bytes == MaxVectorSize, "invalid vector length");
+    __ reduce_non_strict_order_mul_fp_256b($dst$$FloatRegister, T_DOUBLE, $dsrc$$FloatRegister,
+                                           $vsrc$$FloatRegister, length_in_bytes, $tmp1$$FloatRegister,
+                                           $tmp2$$FloatRegister);
   %}
   ins_pipe(pipe_slow);
 %}

diff --git a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp
@@ -4064,6 +4064,15 @@ template<typename R, typename... Rx>
   INSN(sve_brkb, 0b10); // Break before first true condition
 #undef INSN
 
+// SVE Integer Misc - Unpredicated
+
+  // SVE constructive prefix (unpredicated): MOVPRFX <Zd>, <Zn>
+  void sve_movprfx(FloatRegister Zd, FloatRegister Zn) {
+    starti;
+    f(0b00000100, 31, 24), f(0b00, 23, 22), f(0b1, 21), f(0b00000, 20, 16);
+    f(0b101111, 15, 10), rf(Zn, 5), rf(Zd, 0);
+  }
+
 // Element count and increment scalar (SVE)
 #define INSN(NAME, TYPE)                                                             \
   void NAME(Register Xdn, unsigned imm4 = 1, int pattern = 0b11111) {                \