diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d9ed4d4c039f9..968d5a995facf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14943,6 +14943,12 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) {
       !LD->getValueType(0).isInteger())
     return false;
 
+  // The algorithm to split up a load of a scalable vector into individual
+  // elements currently requires knowing the length of the loaded type,
+  // so will need adjusting to work on scalable vectors.
+  if (LD->getValueType(0).isScalableVector())
+    return false;
+
   // Keep track of already used bits to detect overlapping values.
   // In that case, we will just abort the transformation.
   APInt UsedBits(LD->getValueSizeInBits(0), 0);
@@ -16579,7 +16585,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
       }
 
       if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
-          !ST1->getBasePtr().isUndef()) {
+          !ST1->getBasePtr().isUndef() &&
+          // BaseIndexOffset and the code below requires knowing the size
+          // of a vector, so bail out if MemoryVT is scalable.
+          !ST1->getMemoryVT().isScalableVector()) {
        const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
        const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
        unsigned STBitSize = ST->getMemoryVT().getSizeInBits();
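Both DAGCombiner changes above apply the same guard: a combine that reasons about a loaded or stored value in terms of fixed bit or byte sizes has to bail out for scalable vectors, whose getSizeInBits() only reports a known minimum that is scaled by the runtime vscale. A minimal sketch of that guard, for illustration only (the helper name is hypothetical, not something the patch adds):

#include "llvm/CodeGen/ValueTypes.h"

// Hypothetical helper showing the bail-out both combines above rely on:
// reject scalable types before doing any fixed-size offset or overlap
// arithmetic, because their size is only known as known-min * vscale.
static bool fixedSizeCombineApplies(llvm::EVT MemVT) {
  return !MemVT.isScalableVector();
}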
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 86ce09152417c..59dd4905de5a5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3331,9 +3331,6 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
   switch (CC) {
   default:
     report_fatal_error("Unsupported calling convention.");
-  case CallingConv::AArch64_SVE_VectorCall:
-    // Calling SVE functions is currently not yet supported.
-    report_fatal_error("Unsupported calling convention.");
   case CallingConv::WebKit_JS:
     return CC_AArch64_WebKit_JS;
   case CallingConv::GHC:
@@ -3356,6 +3353,7 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
   case CallingConv::CFGuard_Check:
     return CC_AArch64_Win64_CFGuard_Check;
   case CallingConv::AArch64_VectorCall:
+  case CallingConv::AArch64_SVE_VectorCall:
     return CC_AArch64_AAPCS;
   }
 }
@@ -3474,7 +3472,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       case CCValAssign::Indirect:
         assert(VA.getValVT().isScalableVector() &&
                "Only scalable vectors can be passed indirectly");
-        llvm_unreachable("Spilling of SVE vectors not yet implemented");
+        break;
       case CCValAssign::BCvt:
         ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
         break;
@@ -3491,7 +3489,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
     } else { // VA.isRegLoc()
       assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
       unsigned ArgOffset = VA.getLocMemOffset();
-      unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
+      unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
+                              ? VA.getLocVT().getSizeInBits()
+                              : VA.getValVT().getSizeInBits()) / 8;
 
       uint32_t BEAlign = 0;
       if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
@@ -3517,7 +3517,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
       case CCValAssign::Indirect:
         assert(VA.getValVT().isScalableVector() &&
                "Only scalable vectors can be passed indirectly");
-        llvm_unreachable("Spilling of SVE vectors not yet implemented");
+        MemVT = VA.getLocVT();
+        break;
       case CCValAssign::SExt:
         ExtType = ISD::SEXTLOAD;
         break;
@@ -3535,6 +3536,15 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
           MemVT);
     }
+
+    if (VA.getLocInfo() == CCValAssign::Indirect) {
+      assert(VA.getValVT().isScalableVector() &&
+             "Only scalable vectors can be passed indirectly");
+      // If value is passed via pointer - do a load.
+      ArgValue =
+          DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
+    }
+
     if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
       ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
                              ArgValue, DAG.getValueType(MVT::i32));
@@ -3895,6 +3905,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
 
   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
 
+  // If any of the arguments is passed indirectly, it must be SVE, so the
+  // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
+  // allocate space on the stack. That is why we check for this explicitly here;
+  // if any argument is passed indirectly, the call cannot be a tailcall.
+  if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
+        assert((A.getLocInfo() != CCValAssign::Indirect ||
+                A.getValVT().isScalableVector()) &&
+               "Expected value to be scalable");
+        return A.getLocInfo() == CCValAssign::Indirect;
+      }))
+    return false;
+
   // If the stack arguments for this call do not fit into our own save area then
   // the call cannot be made tail.
   if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
@@ -4135,7 +4157,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     case CCValAssign::Indirect:
       assert(VA.getValVT().isScalableVector() &&
              "Only scalable vectors can be passed indirectly");
-      llvm_unreachable("Spilling of SVE vectors not yet implemented");
+      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+      Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
+      unsigned Align = DAG.getDataLayout().getPrefTypeAlignment(Ty);
+      int FI = MFI.CreateStackObject(
+          VA.getValVT().getStoreSize().getKnownMinSize(), Align, false);
+      MFI.setStackID(FI, TargetStackID::SVEVector);
+
+      SDValue SpillSlot = DAG.getFrameIndex(
+          FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+      Chain = DAG.getStore(
+          Chain, DL, Arg, SpillSlot,
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+      Arg = SpillSlot;
+      break;
     }
 
     if (VA.isRegLoc()) {
@@ -4183,8 +4218,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
       // FIXME: This works on big-endian for composite byvals, which are the
       // common case. It should also work for fundamental types too.
       uint32_t BEAlign = 0;
-      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
-                                        : VA.getValVT().getSizeInBits();
+      unsigned OpSize;
+      if (VA.getLocInfo() == CCValAssign::Indirect)
+        OpSize = VA.getLocVT().getSizeInBits();
+      else
+        OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
+                                 : VA.getValVT().getSizeInBits();
       OpSize = (OpSize + 7) / 8;
       if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
           !Flags.isInConsecutiveRegs()) {
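Taken together, the hunks above implement pass-by-reference for SVE values: CCAssignFnForCall now routes AArch64_SVE_VectorCall through CC_AArch64_AAPCS, LowerCall spills an indirectly-assigned argument to a scalable (SVEVector) stack object and passes the slot's address instead of the value, LowerFormalArguments loads the value back through that pointer, and isEligibleForTailCallOptimization refuses tail calls when any argument is passed indirectly, since the spill slot lives in the caller's frame. Below is the caller-side half condensed into a free-standing sketch for illustration; spillSVEArgToStack is a hypothetical name, and the calls simply mirror the ones added to LowerCall above:

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"

using namespace llvm;

// Hypothetical helper: materialise a scalable stack object, store the SVE
// argument into it, and return the slot's address, which is what actually
// gets placed in x0..x7 or on the call frame.
static SDValue spillSVEArgToStack(SelectionDAG &DAG, const SDLoc &DL,
                                  SDValue &Chain, SDValue Arg, EVT VT) {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
  unsigned PrefAlign = DAG.getDataLayout().getPrefTypeAlignment(Ty);
  int FI = MFI.CreateStackObject(VT.getStoreSize().getKnownMinSize(),
                                 PrefAlign, /*isSpillSlot=*/false);
  // Mark the slot as scalable so frame lowering addresses it with ADDVL/ST1.
  MFI.setStackID(FI, TargetStackID::SVEVector);
  SDValue Slot = DAG.getFrameIndex(
      FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
  Chain = DAG.getStore(
      Chain, DL, Arg, Slot,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  return Slot;
}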
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index f2d0d963d621f..e188fa4e2fce5 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1288,6 +1288,9 @@ let Predicates = [HasSVE] in {
   multiclass unpred_store<ValueType Ty, Instruction RegImmInst, Instruction PTrue> {
     def _fi : Pat<(store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
                   (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+
+    def _default : Pat<(store (Ty ZPR:$val), GPR64:$base),
+                       (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
   }
 
   defm Pat_ST1B : unpred_store<nxv16i8, ST1B_IMM, PTRUE_B>;
@@ -1301,6 +1304,9 @@ let Predicates = [HasSVE] in {
   multiclass unpred_load<ValueType Ty, Instruction RegImmInst, Instruction PTrue> {
     def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
                   (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+
+    def _default : Pat<(Ty (load GPR64:$base)),
+                       (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
   }
 
   defm Pat_LD1B : unpred_load<nxv16i8, LD1B_IMM, PTRUE_B>;
@@ -1314,6 +1320,9 @@ let Predicates = [HasSVE] in {
   multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
     def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),
                   (Store PPR:$val, GPR64sp:$base, simm9:$offset)>;
+
+    def _default : Pat<(store (Ty PPR:$Val), GPR64:$base),
+                       (Store PPR:$Val, GPR64:$base, (i64 0))>;
   }
 
   defm Pat_Store_P16 : unpred_store_predicate<nxv16i1, STR_PXI>;
@@ -1324,6 +1333,9 @@ let Predicates = [HasSVE] in {
   multiclass unpred_load_predicate<ValueType Ty, Instruction Load> {
     def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))),
                   (Load GPR64sp:$base, simm9:$offset)>;
+
+    def _default : Pat<(Ty (load GPR64:$base)),
+                       (Load GPR64:$base, (i64 0))>;
   }
 
   defm Pat_Load_P16 : unpred_load_predicate<nxv16i1, LDR_PXI>;
diff --git a/llvm/test/CodeGen/AArch64/sve-callbyref-notailcall.ll b/llvm/test/CodeGen/AArch64/sve-callbyref-notailcall.ll
new file mode 100644
index 0000000000000..ca29e15697fe0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-callbyref-notailcall.ll
@@ -0,0 +1,29 @@
+; Because some arguments are passed by reference (through stack),
+; the compiler should not do tail-call optimization.
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
+
+; CHECK-LABEL: caller:
+; CHECK:       addvl sp, sp, #-[[STACKSIZE:[0-9]+]]
+; CHECK-NOT:   addvl sp
+; CHECK:       bl callee
+; CHECK:       addvl sp, sp, #[[STACKSIZE]]
+; CHECK:       ret
+define <vscale x 16 x i8> @caller(<vscale x 16 x i8> %v) {
+  %1 = tail call <vscale x 16 x i8> @callee(<vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v, <vscale x 16 x i8> %v)
+  ret <vscale x 16 x i8> %1
+}
+
+declare <vscale x 16 x i8> @callee(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+
+; CHECK-LABEL: caller_pred:
+; CHECK:       addvl sp, sp, #-[[STACKSIZE:[0-9]+]]
+; CHECK-NOT:   addvl sp
+; CHECK:       bl callee_pred
+; CHECK:       addvl sp, sp, #[[STACKSIZE]]
+; CHECK:       ret
+define <vscale x 16 x i1> @caller_pred(<vscale x 16 x i1> %v) {
+  %1 = tail call <vscale x 16 x i1> @callee_pred(<vscale x 16 x i1> %v, <vscale x 16 x i1> %v, <vscale x 16 x i1> %v, <vscale x 16 x i1> %v, <vscale x 16 x i1> %v)
+  ret <vscale x 16 x i1> %1
+}
+
+declare <vscale x 16 x i1> @callee_pred(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)
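The _default patterns added to AArch64SVEInstrInfo.td above exist so that the plain loads and stores created for these pointer-based accesses (a bare GPR64 base, no frame index) can still be selected: as LD1/ST1 with an all-true predicate and immediate offset 0 for data vectors, or as LDR/STR with offset 0 for predicates. The test above then pins down the tail-call restriction: the by-reference argument is materialised in the caller's own frame, and a tail call, which gives up that frame before the callee runs, would hand the callee a dangling pointer. A sketch of that rule under a hypothetical helper name (the in-tree check is the llvm::any_of added to isEligibleForTailCallOptimization earlier in this patch):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/CallingConvLower.h"

// Hypothetical helper: a call can only be tail-call optimised if none of its
// arguments is assigned CCValAssign::Indirect, i.e. none of them is passed
// through a slot in the caller's frame.
static bool mayTailCall(llvm::ArrayRef<llvm::CCValAssign> ArgLocs) {
  return llvm::none_of(ArgLocs, [](const llvm::CCValAssign &A) {
    return A.getLocInfo() == llvm::CCValAssign::Indirect;
  });
}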
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
new file mode 100644
index 0000000000000..bbb8209941b0e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -0,0 +1,118 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -stop-after=finalize-isel < %s | FileCheck %s
+
+; Test that z8 and z9, passed in by reference, are correctly loaded from x0 and x1.
+; i.e. z0 =  %z0
+;         :
+;      z7 =  %z7
+;      x0 = &%z8
+;      x1 = &%z9
+define aarch64_sve_vector_pcs <vscale x 4 x i32> @callee_with_many_sve_arg(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3, <vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5, <vscale x 4 x i32> %z6, <vscale x 4 x i32> %z7, <vscale x 4 x i32> %z8, <vscale x 4 x i32> %z9) {
+; CHECK: name: callee_with_many_sve_arg
+; CHECK-DAG: [[BASE:%[0-9]+]]:gpr64common = COPY $x1
+; CHECK-DAG: [[PTRUE:%[0-9]+]]:ppr_3b = PTRUE_S 31
+; CHECK-DAG: [[RES:%[0-9]+]]:zpr = LD1W_IMM killed [[PTRUE]], [[BASE]]
+; CHECK-DAG: $z0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $z0
+  ret <vscale x 4 x i32> %z9
+}
+
+; Test that z8 and z9 are passed by reference.
+define aarch64_sve_vector_pcs <vscale x 4 x i32> @caller_with_many_sve_arg(<vscale x 4 x i32> %z) {
+; CHECK: name: caller_with_many_sve_arg
+; CHECK: stack:
+; CHECK:      - { id: 0, name: '', type: default, offset: 0, size: 16, alignment: 16,
+; CHECK-NEXT:     stack-id: sve-vec
+; CHECK:      - { id: 1, name: '', type: default, offset: 0, size: 16, alignment: 16,
+; CHECK-NEXT:     stack-id: sve-vec
+; CHECK-DAG: [[PTRUE:%[0-9]+]]:ppr_3b = PTRUE_S 31
+; CHECK-DAG: ST1W_IMM %{{[0-9]+}}, [[PTRUE]], %stack.1, 0
+; CHECK-DAG: ST1W_IMM %{{[0-9]+}}, [[PTRUE]], %stack.0, 0
+; CHECK-DAG: [[BASE2:%[0-9]+]]:gpr64sp = ADDXri %stack.1, 0
+; CHECK-DAG: [[BASE1:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0
+; CHECK-DAG: $x0 = COPY [[BASE1]]
+; CHECK-DAG: $x1 = COPY [[BASE2]]
+; CHECK-NEXT: BL @callee_with_many_sve_arg
+; CHECK: RET_ReallyLR implicit $z0
+  %ret = call aarch64_sve_vector_pcs <vscale x 4 x i32> @callee_with_many_sve_arg(<vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z)
+  ret <vscale x 4 x i32> %ret
+}
+
+; Test that p4 and p5, passed in by reference, are correctly loaded from registers x0 and x1.
+; i.e. p0 = %p0
+;        :
+;      p3 = %p3
+;      x0 = &%p4
+;      x1 = &%p5
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @callee_with_many_svepred_arg(<vscale x 16 x i1> %p0, <vscale x 16 x i1> %p1, <vscale x 16 x i1> %p2, <vscale x 16 x i1> %p3, <vscale x 16 x i1> %p4, <vscale x 16 x i1> %p5) {
+; CHECK: name: callee_with_many_svepred_arg
+; CHECK-DAG: [[BASE:%[0-9]+]]:gpr64common = COPY $x1
+; CHECK-DAG: [[RES:%[0-9]+]]:ppr = LDR_PXI [[BASE]], 0
+; CHECK-DAG: $p0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $p0
+  ret <vscale x 16 x i1> %p5
+}
+
+; Test that p4 and p5 are passed by reference.
+define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(<vscale x 16 x i1> %p) {
+; CHECK: name: caller_with_many_svepred_arg
+; CHECK: stack:
+; CHECK:      - { id: 0, name: '', type: default, offset: 0, size: 1, alignment: 4,
+; CHECK-NEXT:     stack-id: sve-vec
+; CHECK:      - { id: 1, name: '', type: default, offset: 0, size: 1, alignment: 4,
+; CHECK-NEXT:     stack-id: sve-vec
+; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.0, 0
+; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.1, 0
+; CHECK-DAG: [[BASE1:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0
+; CHECK-DAG: [[BASE2:%[0-9]+]]:gpr64sp = ADDXri %stack.1, 0
+; CHECK-DAG: $x0 = COPY [[BASE1]]
+; CHECK-DAG: $x1 = COPY [[BASE2]]
+; CHECK-NEXT: BL @callee_with_many_svepred_arg
+; CHECK: RET_ReallyLR implicit $p0
+  %ret = call aarch64_sve_vector_pcs <vscale x 16 x i1> @callee_with_many_svepred_arg(<vscale x 16 x i1> %p, <vscale x 16 x i1> %p, <vscale x 16 x i1> %p, <vscale x 16 x i1> %p, <vscale x 16 x i1> %p, <vscale x 16 x i1> %p)
+  ret <vscale x 16 x i1> %ret
+}
+
+; Test that z8 and z9, passed by reference, are loaded from a location that is passed on the stack.
+; i.e. x0 =  %x0
+;         :
+;      x7 =  %x7
+;      z0 =  %z0
+;         :
+;      z7 =  %z7
+;      [sp]   = &%z8
+;      [sp+8] = &%z9
+;
+define aarch64_sve_vector_pcs <vscale x 4 x i32> @callee_with_many_gpr_sve_arg(i64 %x0, i64 %x1, i64 %x2, i64 %x3, i64 %x4, i64 %x5, i64 %x6, i64 %x7, <vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, <vscale x 4 x i32> %z2, <vscale x 4 x i32> %z3, <vscale x 4 x i32> %z4, <vscale x 4 x i32> %z5, <vscale x 4 x i32> %z6, <vscale x 4 x i32> %z7, <vscale x 2 x double> %z8, <vscale x 4 x i32> %z9) {
+; CHECK: name: callee_with_many_gpr_sve_arg
+; CHECK: fixedStack:
+; CHECK: - { id: 0, type: default, offset: 8, size: 8, alignment: 8, stack-id: default,
+; CHECK-DAG: [[BASE:%[0-9]+]]:gpr64common = LDRXui %fixed-stack.0, 0
+; CHECK-DAG: [[PTRUE:%[0-9]+]]:ppr_3b = PTRUE_S 31
+; CHECK-DAG: [[RES:%[0-9]+]]:zpr = LD1W_IMM killed [[PTRUE]], killed [[BASE]]
+; CHECK-DAG: $z0 = COPY [[RES]]
+; CHECK: RET_ReallyLR implicit $z0
+  ret <vscale x 4 x i32> %z9
+}
+
+; Test that z8 and z9 are passed by reference, where the reference is passed on the stack.
+define aarch64_sve_vector_pcs <vscale x 4 x i32> @caller_with_many_gpr_sve_arg(i64 %x, <vscale x 4 x i32> %z, <vscale x 2 x double> %z2) {
+; CHECK: name: caller_with_many_gpr_sve_arg
+; CHECK: stack:
+; CHECK:      - { id: 0, name: '', type: default, offset: 0, size: 16, alignment: 16,
+; CHECK-NEXT:     stack-id: sve-vec
+; CHECK:      - { id: 1, name: '', type: default, offset: 0, size: 16, alignment: 16,
+; CHECK-NEXT:     stack-id: sve-vec
+; CHECK-DAG: [[PTRUE_S:%[0-9]+]]:ppr_3b = PTRUE_S 31
+; CHECK-DAG: [[PTRUE_D:%[0-9]+]]:ppr_3b = PTRUE_D 31
+; CHECK-DAG: ST1D_IMM %{{[0-9]+}}, killed [[PTRUE_D]], %stack.0, 0
+; CHECK-DAG: ST1W_IMM %{{[0-9]+}}, killed [[PTRUE_S]], %stack.1, 0
+; CHECK-DAG: [[BASE1:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0
+; CHECK-DAG: [[BASE2:%[0-9]+]]:gpr64common = ADDXri %stack.1, 0
+; CHECK-DAG: [[SP:%[0-9]+]]:gpr64sp = COPY $sp
+; CHECK-DAG: STRXui killed [[BASE1]], [[SP]], 0
+; CHECK-DAG: STRXui killed [[BASE2]], [[SP]], 1
+; CHECK: BL @callee_with_many_gpr_sve_arg
+; CHECK: RET_ReallyLR implicit $z0
+  %ret = call aarch64_sve_vector_pcs <vscale x 4 x i32> @callee_with_many_gpr_sve_arg(i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, i64 %x, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 4 x i32> %z, <vscale x 2 x double> %z2, <vscale x 4 x i32> %z)
+  ret <vscale x 4 x i32> %ret
+}
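The MIR-level test above checks both sides of the contract: callers materialise each extra SVE argument in an sve-vec stack object and pass its address (in x0/x1, or spilled to the call frame once the argument GPRs are used up), and callees recover the value with a single predicated load through that pointer. The callee-side lowering, condensed into a sketch with a hypothetical helper name; the body mirrors the DAG.getLoad added to LowerFormalArguments in this patch:

#include <cassert>

#include "llvm/CodeGen/SelectionDAG.h"

// Hypothetical helper: for a CCValAssign::Indirect formal argument the
// incoming register or stack slot holds a pointer, and the actual scalable
// value is loaded through it.
static llvm::SDValue loadIndirectSVEArg(llvm::SelectionDAG &DAG,
                                        const llvm::SDLoc &DL,
                                        llvm::SDValue Chain,
                                        llvm::SDValue PtrArg,
                                        llvm::EVT ValVT) {
  assert(ValVT.isScalableVector() &&
         "Only scalable vectors are expected to be passed indirectly");
  return DAG.getLoad(ValVT, DL, Chain, PtrArg, llvm::MachinePointerInfo());
}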