diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9886d6374665b..16bb7eb222723 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15508,6 +15508,27 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
+static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
+         "Unexpected opcode!");
+
+  SDValue Pred = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
+
+  // setcc_merge_zero pred (sign_extend (setcc_merge_zero ... pred ...)), 0, ne
+  //   => inner setcc_merge_zero
+  if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
+      LHS->getOpcode() == ISD::SIGN_EXTEND &&
+      LHS->getOperand(0)->getValueType(0) == N->getValueType(0) &&
+      LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
+      LHS->getOperand(0)->getOperand(0) == Pred)
+    return LHS->getOperand(0);
+
+  return SDValue();
+}
+
 // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
 // as well as whether the test should be inverted. This code is required to
 // catch these cases (as opposed to standard dag combines) because
@@ -16366,6 +16387,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performSpliceCombine(N, DAG);
   case AArch64ISD::UZP1:
     return performUzpCombine(N, DAG);
+  case AArch64ISD::SETCC_MERGE_ZERO:
+    return performSetccMergeZeroCombine(N, DAG);
   case AArch64ISD::GLD1_MERGE_ZERO:
   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index ecc2ca518df1b..d8c040e3fba29 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -90,8 +90,6 @@ define <8 x float> @masked_load_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
 ; CHECK-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; CHECK-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; CHECK-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; CHECK-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; CHECK-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; CHECK-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x0]
 ; CHECK-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; CHECK-NEXT: ret
@@ -108,8 +106,6 @@ define <16 x float> @masked_load_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -126,8 +122,6 @@ define <32 x float> @masked_load_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0
 ; VBITS_GE_1024-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_1024-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_1024-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_1024-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; VBITS_GE_1024-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; VBITS_GE_1024-NEXT: ret
@@ -144,8 +138,6 @@ define <64 x float> @masked_load_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0
 ; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_2048-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_2048-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; VBITS_GE_2048-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; VBITS_GE_2048-NEXT: ret
@@ -163,8 +155,6 @@ define <64 x i8> @masked_load_v64i8(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
 ; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
-; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].b, [[PG0]]/z, [[Z0]].b, #0
 ; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1b { [[Z0]].b }, [[PG0]], [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -181,8 +171,6 @@ define <32 x i16> @masked_load_v32i16(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
 ; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
-; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].h, [[PG0]]/z, [[Z0]].h, #0
 ; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG0]], [x8]
 ; VBITS_GE_512: ret
@@ -199,8 +187,6 @@ define <16 x i32> @masked_load_v16i32(<16 x i32>* %ap, <16 x i32>* %bp) #0 {
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -217,8 +203,6 @@ define <8 x i64> @masked_load_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
 ; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG0]], [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -235,8 +219,6 @@ define <8 x i64> @masked_load_passthru_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0
 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
 ; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: sel [[Z2:z[0-9]+]].d, [[PG1]], [[Z0]].d, [[Z1]].d
 ; VBITS_GE_512-NEXT: st1d { [[Z2]].d }, [[PG0]], [x8]
@@ -254,8 +236,6 @@ define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>*
 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
 ; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: sel [[Z2:z[0-9]+]].d, [[PG1]], [[Z0]].d, [[Z1]].d
 ; VBITS_GE_512-NEXT: st1d { [[Z2]].d }, [[PG0]], [x8]
@@ -273,12 +253,10 @@ define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
 ; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
-; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, #0
-; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
 ; VBITS_GE_512-NEXT: sunpklo [[Z0]].h, [[Z0]].b
-; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <32 x i8>, <32 x i8>* %ap
   %b = load <32 x i8>, <32 x i8>* %bp
@@ -337,12 +315,10 @@ define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
 ; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
-; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, #0
-; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
 ; VBITS_GE_512-NEXT: sunpklo [[Z0]].s, [[Z0]].h
-; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <16 x i16>, <16 x i16>* %ap
   %b = load <16 x i16>, <16 x i16>* %bp
@@ -379,12 +355,10 @@ define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, #0
-; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
 ; VBITS_GE_512-NEXT: sunpklo [[Z0]].d, [[Z0]].s
-; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <8 x i32>, <8 x i32>* %ap
   %b = load <8 x i32>, <8 x i32>* %bp
@@ -400,12 +374,10 @@ define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
 ; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
-; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, #0
-; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
 ; VBITS_GE_512-NEXT: uunpklo [[Z0]].h, [[Z0]].b
-; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <32 x i8>, <32 x i8>* %ap
   %b = load <32 x i8>, <32 x i8>* %bp
@@ -464,12 +436,10 @@ define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
 ; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
-; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, #0
-; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
 ; VBITS_GE_512-NEXT: uunpklo [[Z0]].s, [[Z0]].h
-; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <16 x i16>, <16 x i16>* %ap
   %b = load <16 x i16>, <16 x i16>* %bp
@@ -506,12 +476,10 @@ define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, #0
-; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
 ; VBITS_GE_512-NEXT: uunpklo [[Z0]].d, [[Z0]].s
-; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <8 x i32>, <8 x i32>* %ap
   %b = load <8 x i32>, <8 x i32>* %bp
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
index bbba4336e0e66..6f5c5cee303c6 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
@@ -91,9 +91,7 @@ define void @masked_store_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
 ; CHECK-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; CHECK-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
 ; CHECK-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; CHECK-NEXT: mov [[Z2:z[0-9]+]].s, [[PG1]]/z, #-1
-; CHECK-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z2]].s, #0
-; CHECK-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; CHECK-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
   %a = load <8 x float>, <8 x float>* %ap
   %b = load <8 x float>, <8 x float>* %bp
@@ -108,9 +106,7 @@ define void @masked_store_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 {
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
 ; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z2:z[0-9]+]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z1]].s, #0
-; VBITS_GE_512-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ret
   %a = load <16 x float>, <16 x float>* %ap
   %b = load <16 x float>, <16 x float>* %bp
@@ -125,9 +121,7 @@ define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
 ; VBITS_GE_1024-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
 ; VBITS_GE_1024-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_1024-NEXT: mov [[Z1:z[0-9]+]].s, [[PG1]]/z, #-1
-; VBITS_GE_1024-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z1]].s, #0
-; VBITS_GE_1024-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; VBITS_GE_1024-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
 ; VBITS_GE_1024-NEXT: ret
   %a = load <32 x float>, <32 x float>* %ap
   %b = load <32 x float>, <32 x float>* %bp
@@ -142,9 +136,7 @@ define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 {
 ; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_2048-NEXT: mov [[Z1:z[0-9]+]].s, [[PG1]]/z, #-1
-; VBITS_GE_2048-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z1]].s, #0
-; VBITS_GE_2048-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; VBITS_GE_2048-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
 ; VBITS_GE_2048-NEXT: ret
   %a = load <64 x float>, <64 x float>* %ap
   %b = load <64 x float>, <64 x float>* %bp
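For reference, a minimal LLVM IR sketch of the kind of input that exercises the new combine (hypothetical function name, modelled on the masked_load_v8i64 test above and its existing RUN setup). When the fixed-length icmp is legalized for SVE, the i1 mask becomes a SETCC_MERGE_ZERO that was previously sign-extended and re-compared against zero before feeding the masked load; the combine lets the load reuse the compare's predicate directly, which is why the mov/cmpne pairs disappear from the CHECK lines above.

; Hypothetical reduced example, not part of the patch; built with the same
; attributes and masked.load declaration as the existing tests.
define <8 x i64> @masked_load_cmp_example(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
  %a = load <8 x i64>, <8 x i64>* %ap
  %b = load <8 x i64>, <8 x i64>* %bp
  %mask = icmp eq <8 x i64> %a, %b    ; lowers to AArch64ISD::SETCC_MERGE_ZERO
  %load = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %ap, i32 8, <8 x i1> %mask, <8 x i64> undef)
  ret <8 x i64> %load
}

declare <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>*, i32, <8 x i1>, <8 x i64>)

attributes #0 = { "target-features"="+sve" }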