diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 9886d6374665b..16bb7eb222723 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15508,6 +15508,27 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
+static SDValue performSetccMergeZeroCombine(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
+         "Unexpected opcode!");
+
+  SDValue Pred = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
+
+  // setcc_merge_zero pred (sign_extend (setcc_merge_zero ... pred ...)), 0, ne
+  //   => inner setcc_merge_zero
+  if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
+      LHS->getOpcode() == ISD::SIGN_EXTEND &&
+      LHS->getOperand(0)->getValueType(0) == N->getValueType(0) &&
+      LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
+      LHS->getOperand(0)->getOperand(0) == Pred)
+    return LHS->getOperand(0);
+
+  return SDValue();
+}
+
 // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
 // as well as whether the test should be inverted. This code is required to
 // catch these cases (as opposed to standard dag combines) because
@@ -16366,6 +16387,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performSpliceCombine(N, DAG);
   case AArch64ISD::UZP1:
     return performUzpCombine(N, DAG);
+  case AArch64ISD::SETCC_MERGE_ZERO:
+    return performSetccMergeZeroCombine(N, DAG);
   case AArch64ISD::GLD1_MERGE_ZERO:
   case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
   case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
index ecc2ca518df1b..d8c040e3fba29 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll
@@ -90,8 +90,6 @@ define <8 x float> @masked_load_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
 ; CHECK-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; CHECK-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; CHECK-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; CHECK-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; CHECK-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; CHECK-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x0]
 ; CHECK-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; CHECK-NEXT: ret
@@ -108,8 +106,6 @@ define <16 x float> @masked_load_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -126,8 +122,6 @@ define <32 x float> @masked_load_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0
 ; VBITS_GE_1024-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_1024-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_1024-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_1024-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; VBITS_GE_1024-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; VBITS_GE_1024-NEXT: ret
@@ -144,8 +138,6 @@ define <64 x float> @masked_load_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0
 ; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_2048-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_2048-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; VBITS_GE_2048-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; VBITS_GE_2048-NEXT: ret
@@ -163,8 +155,6 @@ define <64 x i8> @masked_load_v64i8(<64 x i8>* %ap, <64 x i8>* %bp) #0 {
 ; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
-; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].b, [[PG0]]/z, [[Z0]].b, #0
 ; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1b { [[Z0]].b }, [[PG0]], [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -181,8 +171,6 @@ define <32 x i16> @masked_load_v32i16(<32 x i16>* %ap, <32 x i16>* %bp) #0 {
 ; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
-; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].h, [[PG0]]/z, [[Z0]].h, #0
 ; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG0]], [x8]
 ; VBITS_GE_512: ret
@@ -199,8 +187,6 @@ define <16 x i32> @masked_load_v16i32(<16 x i32>* %ap, <16 x i32>* %bp) #0 {
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].s, [[PG0]]/z, [[Z0]].s, #0
 ; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG0]], [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -217,8 +203,6 @@ define <8 x i64> @masked_load_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
 ; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG0]], [x8]
 ; VBITS_GE_512-NEXT: ret
@@ -235,8 +219,6 @@ define <8 x i64> @masked_load_passthru_v8i64(<8 x i64>* %ap, <8 x i64>* %bp) #0
 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
 ; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: sel [[Z2:z[0-9]+]].d, [[PG1]], [[Z0]].d, [[Z1]].d
 ; VBITS_GE_512-NEXT: st1d { [[Z2]].d }, [[PG0]], [x8]
@@ -254,8 +236,6 @@ define <8 x double> @masked_load_passthru_v8f64(<8 x double>* %ap, <8 x double>*
 ; VBITS_GE_512-NEXT: ld1d { [[Z0:z[0-9]+]].d }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1d { [[Z1:z[0-9]+]].d }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].d, [[PG0]]/z, [[Z0]].d, [[Z1]].d
-; VBITS_GE_512-NEXT: mov [[Z0]].d, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG1]].d, [[PG0]]/z, [[Z0]].d, #0
 ; VBITS_GE_512-NEXT: ld1d { [[Z0]].d }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: sel [[Z2:z[0-9]+]].d, [[PG1]], [[Z0]].d, [[Z1]].d
 ; VBITS_GE_512-NEXT: st1d { [[Z2]].d }, [[PG0]], [x8]
@@ -273,12 +253,10 @@ define <32 x i16> @masked_load_sext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
 ; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
-; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, #0
-; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
 ; VBITS_GE_512-NEXT: sunpklo [[Z0]].h, [[Z0]].b
-; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <32 x i8>, <32 x i8>* %ap
   %b = load <32 x i8>, <32 x i8>* %bp
@@ -337,12 +315,10 @@ define <16 x i32> @masked_load_sext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
 ; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
-; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, #0
-; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
 ; VBITS_GE_512-NEXT: sunpklo [[Z0]].s, [[Z0]].h
-; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <16 x i16>, <16 x i16>* %ap
   %b = load <16 x i16>, <16 x i16>* %bp
@@ -379,12 +355,10 @@ define <8 x i64> @masked_load_sext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, #0
-; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
 ; VBITS_GE_512-NEXT: sunpklo [[Z0]].d, [[Z0]].s
-; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <8 x i32>, <8 x i32>* %ap
   %b = load <8 x i32>, <8 x i32>* %bp
@@ -400,12 +374,10 @@ define <32 x i16> @masked_load_zext_v32i8i16(<32 x i8>* %ap, <32 x i8>* %bp) #0
 ; VBITS_GE_512-NEXT: ld1b { [[Z0:z[0-9]+]].b }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1b { [[Z1:z[0-9]+]].b }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, [[Z1]].b
-; VBITS_GE_512-NEXT: mov [[Z0]].b, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].b, [[PG0]]/z, [[Z0]].b, #0
-; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1b { [[Z0]].b }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].h, vl32
 ; VBITS_GE_512-NEXT: uunpklo [[Z0]].h, [[Z0]].b
-; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1h { [[Z0]].h }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <32 x i8>, <32 x i8>* %ap
   %b = load <32 x i8>, <32 x i8>* %bp
@@ -464,12 +436,10 @@ define <16 x i32> @masked_load_zext_v16i16i32(<16 x i16>* %ap, <16 x i16>* %bp)
 ; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].h }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1h { [[Z1:z[0-9]+]].h }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, [[Z1]].h
-; VBITS_GE_512-NEXT: mov [[Z0]].h, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].h, [[PG0]]/z, [[Z0]].h, #0
-; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1h { [[Z0]].h }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].s, vl16
 ; VBITS_GE_512-NEXT: uunpklo [[Z0]].s, [[Z0]].h
-; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <16 x i16>, <16 x i16>* %ap
   %b = load <16 x i16>, <16 x i16>* %bp
@@ -506,12 +476,10 @@ define <8 x i64> @masked_load_zext_v8i32i64(<8 x i32>* %ap, <8 x i32>* %bp) #0 {
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, p0/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, p0/z, [x1]
 ; VBITS_GE_512-NEXT: cmpeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z0]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, #0
-; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG2]]/z, [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: ld1w { [[Z0]].s }, [[PG1]]/z, [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ptrue [[PG2:p[0-9]+]].d, vl8
 ; VBITS_GE_512-NEXT: uunpklo [[Z0]].d, [[Z0]].s
-; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG2]], [x8]
+; VBITS_GE_512-NEXT: st1d { [[Z0]].d }, [[PG1]], [x8]
 ; VBITS_GE_512-NEXT: ret
   %a = load <8 x i32>, <8 x i32>* %ap
   %b = load <8 x i32>, <8 x i32>* %bp
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
index bbba4336e0e66..6f5c5cee303c6 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll
@@ -91,9 +91,7 @@ define void @masked_store_v8f32(<8 x float>* %ap, <8 x float>* %bp) #0 {
 ; CHECK-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; CHECK-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
 ; CHECK-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; CHECK-NEXT: mov [[Z2:z[0-9]+]].s, [[PG1]]/z, #-1
-; CHECK-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z2]].s, #0
-; CHECK-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; CHECK-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
   %a = load <8 x float>, <8 x float>* %ap
   %b = load <8 x float>, <8 x float>* %bp
@@ -108,9 +106,7 @@ define void @masked_store_v16f32(<16 x float>* %ap, <16 x float>* %bp) #0 {
 ; VBITS_GE_512-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_512-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
 ; VBITS_GE_512-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_512-NEXT: mov [[Z2:z[0-9]+]].s, [[PG1]]/z, #-1
-; VBITS_GE_512-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z1]].s, #0
-; VBITS_GE_512-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; VBITS_GE_512-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
 ; VBITS_GE_512-NEXT: ret
   %a = load <16 x float>, <16 x float>* %ap
   %b = load <16 x float>, <16 x float>* %bp
@@ -125,9 +121,7 @@ define void @masked_store_v32f32(<32 x float>* %ap, <32 x float>* %bp) #0 {
 ; VBITS_GE_1024-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_1024-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
 ; VBITS_GE_1024-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_1024-NEXT: mov [[Z1:z[0-9]+]].s, [[PG1]]/z, #-1
-; VBITS_GE_1024-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z1]].s, #0
-; VBITS_GE_1024-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; VBITS_GE_1024-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
 ; VBITS_GE_1024-NEXT: ret
   %a = load <32 x float>, <32 x float>* %ap
   %b = load <32 x float>, <32 x float>* %bp
@@ -142,9 +136,7 @@ define void @masked_store_v64f32(<64 x float>* %ap, <64 x float>* %bp) #0 {
 ; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1w { [[Z1:z[0-9]+]].s }, [[PG0]]/z, [x1]
 ; VBITS_GE_2048-NEXT: fcmeq [[PG1:p[0-9]+]].s, [[PG0]]/z, [[Z0]].s, [[Z1]].s
-; VBITS_GE_2048-NEXT: mov [[Z1:z[0-9]+]].s, [[PG1]]/z, #-1
-; VBITS_GE_2048-NEXT: cmpne [[PG2:p[0-9]+]].s, [[PG0]]/z, [[Z1]].s, #0
-; VBITS_GE_2048-NEXT: st1w { z0.s }, [[PG2]], [x{{[0-9]+}}]
+; VBITS_GE_2048-NEXT: st1w { z0.s }, [[PG1]], [x{{[0-9]+}}]
 ; VBITS_GE_2048-NEXT: ret
   %a = load <64 x float>, <64 x float>* %ap
   %b = load <64 x float>, <64 x float>* %bp
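For reference, a minimal LLVM IR sketch of the kind of input that exercises the new combine (hypothetical function name, modelled on the masked_load_v8i64 test above and its existing RUN setup). When the fixed-length icmp is legalized for SVE, the i1 mask becomes a SETCC_MERGE_ZERO that was previously sign-extended and re-compared against zero before feeding the masked load; the combine lets the load reuse the compare's predicate directly, which is why the mov/cmpne pairs disappear from the CHECK lines above.

; Hypothetical reduced example, not part of the patch; built with the same
; attributes and masked.load declaration as the existing tests.
define <8 x i64> @masked_load_cmp_example(<8 x i64>* %ap, <8 x i64>* %bp) #0 {
  %a = load <8 x i64>, <8 x i64>* %ap
  %b = load <8 x i64>, <8 x i64>* %bp
  %mask = icmp eq <8 x i64> %a, %b    ; lowers to AArch64ISD::SETCC_MERGE_ZERO
  %load = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %ap, i32 8, <8 x i1> %mask, <8 x i64> undef)
  ret <8 x i64> %load
}

declare <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>*, i32, <8 x i1>, <8 x i64>)

attributes #0 = { "target-features"="+sve" }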