diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 02918e19476d3..0ca63fa92f0cf 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1996,7 +1996,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ) MAKE_CASE(AArch64ISD::ABDS_PRED) MAKE_CASE(AArch64ISD::ABDU_PRED) - MAKE_CASE(AArch64ISD::ADD_PRED) MAKE_CASE(AArch64ISD::MUL_PRED) MAKE_CASE(AArch64ISD::MULHS_PRED) MAKE_CASE(AArch64ISD::MULHU_PRED) @@ -2006,7 +2005,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::SMIN_PRED) MAKE_CASE(AArch64ISD::SRA_PRED) MAKE_CASE(AArch64ISD::SRL_PRED) - MAKE_CASE(AArch64ISD::SUB_PRED) MAKE_CASE(AArch64ISD::UDIV_PRED) MAKE_CASE(AArch64ISD::UMAX_PRED) MAKE_CASE(AArch64ISD::UMIN_PRED) @@ -5240,11 +5238,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, return LowerFixedLengthVectorLoadToSVE(Op, DAG); return LowerLOAD(Op, DAG); case ISD::ADD: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED); case ISD::AND: - return LowerToScalableOp(Op, DAG); case ISD::SUB: - return LowerToPredicatedOp(Op, DAG, AArch64ISD::SUB_PRED); + return LowerToScalableOp(Op, DAG); case ISD::FMAXIMUM: return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED); case ISD::FMAXNUM: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index db771070e71d3..7dfea1fbd216a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -79,7 +79,6 @@ enum NodeType : unsigned { // Predicated instructions where inactive lanes produce undefined results. ABDS_PRED, ABDU_PRED, - ADD_PRED, FADD_PRED, FDIV_PRED, FMA_PRED, @@ -98,7 +97,6 @@ enum NodeType : unsigned { SMIN_PRED, SRA_PRED, SRL_PRED, - SUB_PRED, UDIV_PRED, UMAX_PRED, UMIN_PRED, diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 06486a66de20b..2901527a07d36 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -175,7 +175,6 @@ def SDT_AArch64FMA : SDTypeProfile<1, 4, [ ]>; // Predicated operations with the result of inactive lanes being unspecified. 
-def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>; def AArch64asr_p : SDNode<"AArch64ISD::SRA_PRED", SDT_AArch64Arith>; def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>; def AArch64fdiv_p : SDNode<"AArch64ISD::FDIV_PRED", SDT_AArch64Arith>; @@ -194,7 +193,6 @@ def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>; def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>; def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>; def AArch64smulh_p : SDNode<"AArch64ISD::MULHS_PRED", SDT_AArch64Arith>; -def AArch64sub_p : SDNode<"AArch64ISD::SUB_PRED", SDT_AArch64Arith>; def AArch64uabd_p : SDNode<"AArch64ISD::ABDU_PRED", SDT_AArch64Arith>; def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>; def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>; @@ -328,9 +326,6 @@ let Predicates = [HasSVEorStreamingSVE] in { defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", int_aarch64_sve_add, DestructiveBinaryComm>; defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">; defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>; - - defm ADD_ZPZZ : sve_int_bin_pred_bhsd<AArch64add_p>; - defm SUB_ZPZZ : sve_int_bin_pred_bhsd<AArch64sub_p>; } // End HasSVEorStreamingSVE let Predicates = [HasSVEorStreamingSVE, UseExperimentalZeroingPseudos] in { diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll index d54c3a969a27b..1d94566f1a8a1 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll @@ -49,7 +49,7 @@ define void @add_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]] ; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] ; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] -; CHECK: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; CHECK: add [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b ; CHECK: st1b { [[RES]].b }, [[PG]], [x0] ; CHECK: ret %op1 = load <32 x i8>, <32 x i8>* %a @@ -64,12 +64,12 @@ define void @add_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]] ; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] ; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] -; CHECK-DAG: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; CHECK-DAG: add [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b ; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0] ; VBITS_LE_256-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]] ; VBITS_LE_256-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]] ; VBITS_LE_256-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]] -; VBITS_LE_256-DAG: add [[RES_1:z[0-9]+]].b, [[PG]]/m, [[OP1_1]].b, [[OP2_1]].b +; VBITS_LE_256-DAG: add [[RES_1:z[0-9]+]].b, [[OP1_1]].b, [[OP2_1]].b ; VBITS_LE_256-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]] ; CHECK: ret %op1 = load <64 x i8>, <64 x i8>* %a @@ -84,22 +84,22 @@ define void @add_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { ; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]] ; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] ; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] -; CHECK-DAG: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; CHECK-DAG: add [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b ; 
CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0] ; VBITS_LE_512-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]] ; VBITS_LE_512-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]] ; VBITS_LE_512-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]] -; VBITS_LE_512-DAG: add [[RES_1:z[0-9]+]].b, [[PG]]/m, [[OP1_1]].b, [[OP2_1]].b +; VBITS_LE_512-DAG: add [[RES_1:z[0-9]+]].b, [[OP1_1]].b, [[OP2_1]].b ; VBITS_LE_512-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]] ; VBITS_LE_256-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]] ; VBITS_LE_256-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]] ; VBITS_LE_256-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]] -; VBITS_LE_256-DAG: add [[RES_2:z[0-9]+]].b, [[PG]]/m, [[OP1_2]].b, [[OP2_2]].b +; VBITS_LE_256-DAG: add [[RES_2:z[0-9]+]].b, [[OP1_2]].b, [[OP2_2]].b ; VBITS_LE_256-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]] ; VBITS_LE_256-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]] ; VBITS_LE_256-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]] ; VBITS_LE_256-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]] -; VBITS_LE_256-DAG: add [[RES_3:z[0-9]+]].b, [[PG]]/m, [[OP1_3]].b, [[OP2_3]].b +; VBITS_LE_256-DAG: add [[RES_3:z[0-9]+]].b, [[OP1_3]].b, [[OP2_3]].b ; VBITS_LE_256-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]] ; CHECK: ret %op1 = load <128 x i8>, <128 x i8>* %a @@ -114,42 +114,42 @@ define void @add_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { ; CHECK-DAG: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]] ; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] ; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] -; CHECK-DAG: add [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; CHECK-DAG: add [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b ; CHECK-DAG: st1b { [[RES]].b }, [[PG]], [x0] ; VBITS_LE_1024-DAG: mov w[[OFF_1:[0-9]+]], #[[#VBYTES]] ; VBITS_LE_1024-DAG: ld1b { [[OP1_1:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_1]]] ; VBITS_LE_1024-DAG: ld1b { [[OP2_1:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_1]]] -; VBITS_LE_1024-DAG: add [[RES_1:z[0-9]+]].b, [[PG]]/m, [[OP1_1]].b, [[OP2_1]].b +; VBITS_LE_1024-DAG: add [[RES_1:z[0-9]+]].b, [[OP1_1]].b, [[OP2_1]].b ; VBITS_LE_1024-DAG: st1b { [[RES_1]].b }, [[PG]], [x0, x[[OFF_1]]] ; VBITS_LE_512-DAG: mov w[[OFF_2:[0-9]+]], #[[#mul(VBYTES,2)]] ; VBITS_LE_512-DAG: ld1b { [[OP1_2:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_2]]] ; VBITS_LE_512-DAG: ld1b { [[OP2_2:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_2]]] -; VBITS_LE_512-DAG: add [[RES_2:z[0-9]+]].b, [[PG]]/m, [[OP1_2]].b, [[OP2_2]].b +; VBITS_LE_512-DAG: add [[RES_2:z[0-9]+]].b, [[OP1_2]].b, [[OP2_2]].b ; VBITS_LE_512-DAG: st1b { [[RES_2]].b }, [[PG]], [x0, x[[OFF_2]]] ; VBITS_LE_512-DAG: mov w[[OFF_3:[0-9]+]], #[[#mul(VBYTES,3)]] ; VBITS_LE_512-DAG: ld1b { [[OP1_3:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_3]]] ; VBITS_LE_512-DAG: ld1b { [[OP2_3:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_3]]] -; VBITS_LE_512-DAG: add [[RES_3:z[0-9]+]].b, [[PG]]/m, [[OP1_3]].b, [[OP2_3]].b +; VBITS_LE_512-DAG: add [[RES_3:z[0-9]+]].b, [[OP1_3]].b, [[OP2_3]].b ; VBITS_LE_512-DAG: st1b { [[RES_3]].b }, [[PG]], [x0, x[[OFF_3]]] ; VBITS_LE_256-DAG: mov w[[OFF_4:[0-9]+]], #[[#mul(VBYTES,4)]] ; VBITS_LE_256-DAG: ld1b { [[OP1_4:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_4]]] ; VBITS_LE_256-DAG: ld1b { [[OP2_4:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_4]]] -; VBITS_LE_256-DAG: add [[RES_4:z[0-9]+]].b, [[PG]]/m, [[OP1_4]].b, [[OP2_4]].b +; VBITS_LE_256-DAG: add [[RES_4:z[0-9]+]].b, [[OP1_4]].b, [[OP2_4]].b ; VBITS_LE_256-DAG: st1b { [[RES_4]].b }, [[PG]], [x0, 
x[[OFF_4]]] ; VBITS_LE_256-DAG: mov w[[OFF_5:[0-9]+]], #[[#mul(VBYTES,5)]] ; VBITS_LE_256-DAG: ld1b { [[OP1_5:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_5]]] ; VBITS_LE_256-DAG: ld1b { [[OP2_5:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_5]]] -; VBITS_LE_256-DAG: add [[RES_5:z[0-9]+]].b, [[PG]]/m, [[OP1_5]].b, [[OP2_5]].b +; VBITS_LE_256-DAG: add [[RES_5:z[0-9]+]].b, [[OP1_5]].b, [[OP2_5]].b ; VBITS_LE_256-DAG: st1b { [[RES_5]].b }, [[PG]], [x0, x[[OFF_5]]] ; VBITS_LE_256-DAG: mov w[[OFF_6:[0-9]+]], #[[#mul(VBYTES,6)]] ; VBITS_LE_256-DAG: ld1b { [[OP1_6:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_6]]] ; VBITS_LE_256-DAG: ld1b { [[OP2_6:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_6]]] -; VBITS_LE_256-DAG: add [[RES_6:z[0-9]+]].b, [[PG]]/m, [[OP1_6]].b, [[OP2_6]].b +; VBITS_LE_256-DAG: add [[RES_6:z[0-9]+]].b, [[OP1_6]].b, [[OP2_6]].b ; VBITS_LE_256-DAG: st1b { [[RES_6]].b }, [[PG]], [x0, x[[OFF_6]]] ; VBITS_LE_256-DAG: mov w[[OFF_7:[0-9]+]], #[[#mul(VBYTES,7)]] ; VBITS_LE_256-DAG: ld1b { [[OP1_7:z[0-9]+]].b }, [[PG]]/z, [x0, x[[OFF_7]]] ; VBITS_LE_256-DAG: ld1b { [[OP2_7:z[0-9]+]].b }, [[PG]]/z, [x1, x[[OFF_7]]] -; VBITS_LE_256-DAG: add [[RES_7:z[0-9]+]].b, [[PG]]/m, [[OP1_7]].b, [[OP2_7]].b +; VBITS_LE_256-DAG: add [[RES_7:z[0-9]+]].b, [[OP1_7]].b, [[OP2_7]].b ; VBITS_LE_256-DAG: st1b { [[RES_7]].b }, [[PG]], [x0, x[[OFF_7]]] ; CHECK: ret %op1 = load <256 x i8>, <256 x i8>* %a @@ -182,7 +182,7 @@ define void @add_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]] ; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; CHECK: add [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h ; CHECK: st1h { [[RES]].h }, [[PG]], [x0] ; CHECK: ret %op1 = load <16 x i16>, <16 x i16>* %a @@ -199,7 +199,7 @@ define void @add_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]] ; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; CHECK: add [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h ; CHECK: st1h { [[RES]].h }, [[PG]], [x0] ; CHECK: ret %op1 = load <32 x i16>, <32 x i16>* %a @@ -216,7 +216,7 @@ define void @add_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]] ; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; CHECK: add [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h ; CHECK: st1h { [[RES]].h }, [[PG]], [x0] ; CHECK: ret %op1 = load <64 x i16>, <64 x i16>* %a @@ -233,7 +233,7 @@ define void @add_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]] ; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; CHECK: add [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; CHECK: add [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h ; CHECK: st1h { [[RES]].h }, [[PG]], [x0] ; CHECK: ret %op1 = load <128 x i16>, <128 x i16>* %a @@ -266,7 +266,7 @@ define void @add_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]] ; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] -; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, 
[[OP2]].s +; CHECK: add [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s ; CHECK: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK: ret %op1 = load <8 x i32>, <8 x i32>* %a @@ -283,7 +283,7 @@ define void @add_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]] ; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] -; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; CHECK: add [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s ; CHECK: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK: ret %op1 = load <16 x i32>, <16 x i32>* %a @@ -300,7 +300,7 @@ define void @add_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]] ; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] -; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; CHECK: add [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s ; CHECK: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK: ret %op1 = load <32 x i32>, <32 x i32>* %a @@ -317,7 +317,7 @@ define void @add_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]] ; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] -; CHECK: add [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; CHECK: add [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s ; CHECK: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK: ret %op1 = load <64 x i32>, <64 x i32>* %a @@ -350,7 +350,7 @@ define void @add_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]] ; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] -; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; CHECK: add [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d ; CHECK: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK: ret %op1 = load <4 x i64>, <4 x i64>* %a @@ -367,7 +367,7 @@ define void @add_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]] ; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] -; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; CHECK: add [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d ; CHECK: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK: ret %op1 = load <8 x i64>, <8 x i64>* %a @@ -384,7 +384,7 @@ define void @add_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]] ; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] -; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; CHECK: add [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d ; CHECK: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK: ret %op1 = load <16 x i64>, <16 x i64>* %a @@ -401,7 +401,7 @@ define void @add_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]] ; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] -; CHECK: add [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; CHECK: add [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d ; CHECK: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK: ret %op1 = load <32 x i64>, <32 x i64>* %a @@ -771,7 +771,7 @@ define void @sub_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]] ; 
CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] ; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; CHECK: sub [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b ; CHECK: st1b { [[RES]].b }, [[PG]], [x0] ; CHECK: ret %op1 = load <32 x i8>, <32 x i8>* %a @@ -786,7 +786,7 @@ define void @sub_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]] ; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] ; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; CHECK: sub [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b ; CHECK: st1b { [[RES]].b }, [[PG]], [x0] ; CHECK: ret %op1 = load <64 x i8>, <64 x i8>* %a @@ -801,7 +801,7 @@ define void @sub_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]] ; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] ; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; CHECK: sub [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b ; CHECK: st1b { [[RES]].b }, [[PG]], [x0] ; CHECK: ret %op1 = load <128 x i8>, <128 x i8>* %a @@ -816,7 +816,7 @@ define void @sub_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,256)]] ; CHECK-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0] ; CHECK-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b +; CHECK: sub [[RES:z[0-9]+]].b, [[OP1]].b, [[OP2]].b ; CHECK: st1b { [[RES]].b }, [[PG]], [x0] ; CHECK: ret %op1 = load <256 x i8>, <256 x i8>* %a @@ -849,7 +849,7 @@ define void @sub_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]] ; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; CHECK: sub [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h ; CHECK: st1h { [[RES]].h }, [[PG]], [x0] ; CHECK: ret %op1 = load <16 x i16>, <16 x i16>* %a @@ -864,7 +864,7 @@ define void @sub_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]] ; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; CHECK: sub [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h ; CHECK: st1h { [[RES]].h }, [[PG]], [x0] ; CHECK: ret %op1 = load <32 x i16>, <32 x i16>* %a @@ -879,7 +879,7 @@ define void @sub_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]] ; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; CHECK: sub [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h ; CHECK: st1h { [[RES]].h }, [[PG]], [x0] ; CHECK: ret %op1 = load <64 x i16>, <64 x i16>* %a @@ -894,7 +894,7 @@ define void @sub_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]] ; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] ; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h +; CHECK: sub [[RES:z[0-9]+]].h, [[OP1]].h, [[OP2]].h ; CHECK: st1h { [[RES]].h }, [[PG]], [x0] ; CHECK: ret %op1 = load <128 x i16>, 
<128 x i16>* %a @@ -927,7 +927,7 @@ define void @sub_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]] ; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; CHECK: sub [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s ; CHECK: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK: ret %op1 = load <8 x i32>, <8 x i32>* %a @@ -942,7 +942,7 @@ define void @sub_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]] ; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; CHECK: sub [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s ; CHECK: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK: ret %op1 = load <16 x i32>, <16 x i32>* %a @@ -957,7 +957,7 @@ define void @sub_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]] ; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; CHECK: sub [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s ; CHECK: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK: ret %op1 = load <32 x i32>, <32 x i32>* %a @@ -972,7 +972,7 @@ define void @sub_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]] ; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] ; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s +; CHECK: sub [[RES:z[0-9]+]].s, [[OP1]].s, [[OP2]].s ; CHECK: st1w { [[RES]].s }, [[PG]], [x0] ; CHECK: ret %op1 = load <64 x i32>, <64 x i32>* %a @@ -1005,7 +1005,7 @@ define void @sub_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]] ; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; CHECK: sub [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d ; CHECK: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK: ret %op1 = load <4 x i64>, <4 x i64>* %a @@ -1020,7 +1020,7 @@ define void @sub_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]] ; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; CHECK: sub [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d ; CHECK: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK: ret %op1 = load <8 x i64>, <8 x i64>* %a @@ -1035,7 +1035,7 @@ define void @sub_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]] ; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] -; CHECK: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; CHECK: sub [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d ; CHECK: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK: ret %op1 = load <16 x i64>, <16 x i64>* %a @@ -1050,7 +1050,7 @@ define void @sub_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { ; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]] ; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] ; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] -; CHECK: sub 
[[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d +; CHECK: sub [[RES:z[0-9]+]].d, [[OP1]].d, [[OP2]].d ; CHECK: st1d { [[RES]].d }, [[PG]], [x0] ; CHECK: ret %op1 = load <32 x i64>, <32 x i64>* %a diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll index 26a5537dea1ca..bfe4b47242c35 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll @@ -80,8 +80,8 @@ define void @sext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 { ; NOTE: Extra 'add' is to prevent the extend being combined with the load. define void @sext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 { ; CHECK-LABEL: sext_v32i8_v32i16: -; VBITS_GE_512: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].h, vl32 +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 +; VBITS_GE_512-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b ; VBITS_GE_512-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_512-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1] ; VBITS_GE_512-NEXT: ret @@ -94,8 +94,8 @@ define void @sext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 { define void @sext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 { ; CHECK-LABEL: sext_v64i8_v64i16: -; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].h, vl64 +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 +; VBITS_GE_1024-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b ; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_1024-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret @@ -108,8 +108,8 @@ define void @sext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 { define void @sext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 { ; CHECK-LABEL: sext_v128i8_v128i16: -; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].h, vl128 +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 +; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b ; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_2048-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret @@ -162,8 +162,8 @@ define void @sext_v16i8_v16i32(<16 x i8> %a, <16 x i32>* %out) #0 { define void @sext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 { ; CHECK-LABEL: sext_v32i8_v32i32: -; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32 +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 +; VBITS_GE_1024-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b ; VBITS_GE_1024-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] @@ -177,8 +177,8 @@ define void @sext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 { define void @sext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 { ; CHECK-LABEL: sext_v64i8_v64i32: -; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64 +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 +; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b ; VBITS_GE_2048-NEXT: sunpklo 
[[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] @@ -238,8 +238,8 @@ define void @sext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) #0 { define void @sext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 { ; CHECK-LABEL: sext_v32i8_v32i64: -; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b ; VBITS_GE_2048-NEXT: sunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s @@ -269,8 +269,8 @@ define void @sext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) #0 { define void @sext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 { ; CHECK-LABEL: sext_v16i16_v16i32: -; VBITS_GE_512: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].s, vl16 +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 +; VBITS_GE_512-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h ; VBITS_GE_512-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_512-NEXT: ret @@ -283,8 +283,8 @@ define void @sext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 { define void @sext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 { ; CHECK-LABEL: sext_v32i16_v32i32: -; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32 +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 +; VBITS_GE_1024-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h ; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret @@ -297,8 +297,8 @@ define void @sext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 { define void @sext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 { ; CHECK-LABEL: sext_v64i16_v64i32: -; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64 +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 +; VBITS_GE_2048-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h ; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret @@ -339,8 +339,8 @@ define void @sext_v8i16_v8i64(<8 x i16> %a, <8 x i64>* %out) #0 { define void @sext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 { ; CHECK-LABEL: sext_v16i16_v16i64: -; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h ; VBITS_GE_1024-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] @@ -354,8 +354,8 @@ define void @sext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 { define void @sext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 { ; CHECK-LABEL: sext_v32i16_v32i64: -; VBITS_GE_2048: add 
[[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h ; VBITS_GE_2048-NEXT: sunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] @@ -384,8 +384,8 @@ define void @sext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) #0 { define void @sext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 { ; CHECK-LABEL: sext_v8i32_v8i64: -; VBITS_GE_512: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s -; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s ; VBITS_GE_512-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_512-NEXT: ret @@ -398,8 +398,8 @@ define void @sext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 { define void @sext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 { ; CHECK-LABEL: sext_v16i32_v16i64: -; VBITS_GE_1024: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s -; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s ; VBITS_GE_1024-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret @@ -412,8 +412,8 @@ define void @sext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 { define void @sext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 { ; CHECK-LABEL: sext_v32i32_v32i64: -; VBITS_GE_2048: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s ; VBITS_GE_2048-NEXT: sunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret @@ -442,8 +442,8 @@ define void @zext_v16i8_v16i16(<16 x i8> %a, <16 x i16>* %out) #0 { ; NOTE: Extra 'add' is to prevent the extend being combined with the load. 
define void @zext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 { ; CHECK-LABEL: zext_v32i8_v32i16: -; VBITS_GE_512: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].h, vl32 +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 +; VBITS_GE_512-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b ; VBITS_GE_512-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_512-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1] ; VBITS_GE_512-NEXT: ret @@ -456,8 +456,8 @@ define void @zext_v32i8_v32i16(<32 x i8>* %in, <32 x i16>* %out) #0 { define void @zext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 { ; CHECK-LABEL: zext_v64i8_v64i16: -; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].h, vl64 +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 +; VBITS_GE_1024-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b ; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_1024-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret @@ -470,8 +470,8 @@ define void @zext_v64i8_v64i16(<64 x i8>* %in, <64 x i16>* %out) #0 { define void @zext_v128i8_v128i16(<128 x i8>* %in, <128 x i16>* %out) #0 { ; CHECK-LABEL: zext_v128i8_v128i16: -; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].h, vl128 +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 +; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b ; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_2048-NEXT: st1h { [[A_HALFS]].h }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret @@ -524,8 +524,8 @@ define void @zext_v16i8_v16i32(<16 x i8> %a, <16 x i32>* %out) #0 { define void @zext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 { ; CHECK-LABEL: zext_v32i8_v32i32: -; VBITS_GE_1024: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32 +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 +; VBITS_GE_1024-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b ; VBITS_GE_1024-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] @@ -539,8 +539,8 @@ define void @zext_v32i8_v32i32(<32 x i8>* %in, <32 x i32>* %out) #0 { define void @zext_v64i8_v64i32(<64 x i8>* %in, <64 x i32>* %out) #0 { ; CHECK-LABEL: zext_v64i8_v64i32: -; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64 +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 +; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b ; VBITS_GE_2048-NEXT: uunpklo [[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] @@ -600,8 +600,8 @@ define void @zext_v16i8_v16i64(<16 x i8> %a, <16 x i64>* %out) #0 { define void @zext_v32i8_v32i64(<32 x i8>* %in, <32 x i64>* %out) #0 { ; CHECK-LABEL: zext_v32i8_v32i64: -; VBITS_GE_2048: add [[A_BYTES:z[0-9]+]].b, {{p[0-9]+}}/m, {{z[0-9]+}}.b, {{z[0-9]+}}.b -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048-NEXT: add [[A_BYTES:z[0-9]+]].b, {{z[0-9]+}}.b, {{z[0-9]+}}.b ; VBITS_GE_2048-NEXT: uunpklo 
[[A_HALFS:z[0-9]+]].h, [[A_BYTES]].b ; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s @@ -631,8 +631,8 @@ define void @zext_v8i16_v8i32(<8 x i16> %a, <8 x i32>* %out) #0 { define void @zext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 { ; CHECK-LABEL: zext_v16i16_v16i32: -; VBITS_GE_512: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].s, vl16 +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 +; VBITS_GE_512-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h ; VBITS_GE_512-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_512-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_512-NEXT: ret @@ -645,8 +645,8 @@ define void @zext_v16i16_v16i32(<16 x i16>* %in, <16 x i32>* %out) #0 { define void @zext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 { ; CHECK-LABEL: zext_v32i16_v32i32: -; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].s, vl32 +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 +; VBITS_GE_1024-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h ; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_1024-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret @@ -659,8 +659,8 @@ define void @zext_v32i16_v32i32(<32 x i16>* %in, <32 x i32>* %out) #0 { define void @zext_v64i16_v64i32(<64 x i16>* %in, <64 x i32>* %out) #0 { ; CHECK-LABEL: zext_v64i16_v64i32: -; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].s, vl64 +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 +; VBITS_GE_2048-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h ; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: st1w { [[A_WORDS]].s }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret @@ -701,8 +701,8 @@ define void @zext_v8i16_v8i64(<8 x i16> %a, <8 x i64>* %out) #0 { define void @zext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 { ; CHECK-LABEL: zext_v16i16_v16i64: -; VBITS_GE_1024: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h ; VBITS_GE_1024-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] @@ -716,8 +716,8 @@ define void @zext_v16i16_v16i64(<16 x i16>* %in, <16 x i64>* %out) #0 { define void @zext_v32i16_v32i64(<32 x i16>* %in, <32 x i64>* %out) #0 { ; CHECK-LABEL: zext_v32i16_v32i64: -; VBITS_GE_2048: add [[A_HALFS:z[0-9]+]].h, {{p[0-9]+}}/m, {{z[0-9]+}}.h, {{z[0-9]+}}.h -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048-NEXT: add [[A_HALFS:z[0-9]+]].h, {{z[0-9]+}}.h, {{z[0-9]+}}.h ; VBITS_GE_2048-NEXT: uunpklo [[A_WORDS:z[0-9]+]].s, [[A_HALFS]].h ; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] @@ -746,8 +746,8 @@ define void @zext_v4i32_v4i64(<4 x i32> %a, <4 x i64>* %out) #0 { define void @zext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 { ; CHECK-LABEL: zext_v8i32_v8i64: -; VBITS_GE_512: add 
[[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s -; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s ; VBITS_GE_512-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_512-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_512-NEXT: ret @@ -760,8 +760,8 @@ define void @zext_v8i32_v8i64(<8 x i32>* %in, <8 x i64>* %out) #0 { define void @zext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 { ; CHECK-LABEL: zext_v16i32_v16i64: -; VBITS_GE_1024: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s -; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s ; VBITS_GE_1024-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_1024-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_1024-NEXT: ret @@ -774,8 +774,8 @@ define void @zext_v16i32_v16i64(<16 x i32>* %in, <16 x i64>* %out) #0 { define void @zext_v32i32_v32i64(<32 x i32>* %in, <32 x i64>* %out) #0 { ; CHECK-LABEL: zext_v32i32_v32i64: -; VBITS_GE_2048: add [[A_WORDS:z[0-9]+]].s, {{p[0-9]+}}/m, {{z[0-9]+}}.s, {{z[0-9]+}}.s -; VBITS_GE_2048-NEXT: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048-NEXT: add [[A_WORDS:z[0-9]+]].s, {{z[0-9]+}}.s, {{z[0-9]+}}.s ; VBITS_GE_2048-NEXT: uunpklo [[A_DWORDS:z[0-9]+]].d, [[A_WORDS]].s ; VBITS_GE_2048-NEXT: st1d { [[A_DWORDS]].d }, [[PG]], [x1] ; VBITS_GE_2048-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-immediates.ll index dda17f33e9a7a..f6bc72f692467 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-immediates.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-immediates.ll @@ -15,9 +15,8 @@ define void @add_v64i8(<64 x i8>* %a) #0 { ; CHECK-LABEL: add_v64i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: mov z1.b, #7 // =0x7 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: add z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: add z0.b, z0.b, #7 // =0x7 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i8>, <64 x i8>* %a @@ -32,9 +31,8 @@ define void @add_v32i16(<32 x i16>* %a) #0 { ; CHECK-LABEL: add_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: mov z1.h, #15 // =0xf ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: add z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: add z0.h, z0.h, #15 // =0xf ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i16>, <32 x i16>* %a @@ -49,9 +47,8 @@ define void @add_v16i32(<16 x i32>* %a) #0 { ; CHECK-LABEL: add_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: mov z1.s, #31 // =0x1f ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: add z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: add z0.s, z0.s, #31 // =0x1f ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a @@ -66,9 +63,8 @@ define void @add_v8i64(<8 x i64>* %a) #0 { ; CHECK-LABEL: add_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: mov z1.d, #63 // =0x3f ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: add z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: add z0.d, z0.d, #63 // =0x3f ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a @@ -719,9 +715,8 @@ define void 
@sub_v64i8(<64 x i8>* %a) #0 { ; CHECK-LABEL: sub_v64i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: mov z1.b, #7 // =0x7 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: sub z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: sub z0.b, z0.b, #7 // =0x7 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i8>, <64 x i8>* %a @@ -736,9 +731,8 @@ define void @sub_v32i16(<32 x i16>* %a) #0 { ; CHECK-LABEL: sub_v32i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: mov z1.h, #15 // =0xf ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: sub z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: sub z0.h, z0.h, #15 // =0xf ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i16>, <32 x i16>* %a @@ -753,9 +747,8 @@ define void @sub_v16i32(<16 x i32>* %a) #0 { ; CHECK-LABEL: sub_v16i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: mov z1.s, #31 // =0x1f ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: sub z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: sub z0.s, z0.s, #31 // =0x1f ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a @@ -770,9 +763,8 @@ define void @sub_v8i64(<8 x i64>* %a) #0 { ; CHECK-LABEL: sub_v8i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: mov z1.d, #63 // =0x3f ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: sub z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: sub z0.d, z0.d, #63 // =0x3f ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll index 6f419138ae3d3..9ce3873af7743 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-reduce.ll @@ -67,7 +67,7 @@ define i8 @uaddv_v64i8(<64 x i8>* %a) #0 { ; VBITS_EQ_256-DAG: mov w[[NUMELTS:[0-9]+]], #32 ; VBITS_EQ_256-DAG: ld1b { [[LO:z[0-9]+]].b }, [[PG]]/z, [x0] ; VBITS_EQ_256-DAG: ld1b { [[HI:z[0-9]+]].b }, [[PG]]/z, [x0, x[[NUMELTS]]] -; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].b, [[PG]]/m, [[HI]].b, [[LO]].b +; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].b, [[LO]].b, [[HI]].b ; VBITS_EQ_256-DAG: addv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].b ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret @@ -143,7 +143,7 @@ define i16 @uaddv_v32i16(<32 x i16>* %a) #0 { ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #16 ; VBITS_EQ_256-DAG: ld1h { [[LO:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_EQ_256-DAG: ld1h { [[HI:z[0-9]+]].h }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #1] -; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].h, [[PG]]/m, [[HI]].h, [[LO]].h +; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].h, [[LO]].h, [[HI]].h ; VBITS_EQ_256-DAG: addv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].h ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret @@ -219,7 +219,7 @@ define i32 @uaddv_v16i32(<16 x i32>* %a) #0 { ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #8 ; VBITS_EQ_256-DAG: ld1w { [[LO:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_EQ_256-DAG: ld1w { [[HI:z[0-9]+]].s }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #2] -; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].s, [[PG]]/m, [[HI]].s, [[LO]].s +; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].s, [[LO]].s, [[HI]].s ; VBITS_EQ_256-DAG: addv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].s ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret @@ -295,7 +295,7 @@ define i64 @uaddv_v8i64(<8 x i64>* %a) #0 { ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4 ; VBITS_EQ_256-DAG: ld1d { 
[[LO:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_EQ_256-DAG: ld1d { [[HI:z[0-9]+]].d }, [[PG]]/z, [x0, x[[NUMELTS]], lsl #3] -; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].d, [[PG]]/m, [[HI]].d, [[LO]].d +; VBITS_EQ_256-DAG: add [[ADD:z[0-9]+]].d, [[LO]].d, [[HI]].d ; VBITS_EQ_256-DAG: addv [[REDUCE:d[0-9]+]], [[PG]], [[ADD]].d ; VBITS_EQ_256-NEXT: fmov x0, [[REDUCE]] ; VBITS_EQ_256-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll index a3aeed9c4aed8..3626aa915541e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -154,9 +154,8 @@ define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h ; VBITS_EQ_256-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h ; VBITS_EQ_256-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b -; VBITS_EQ_256-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP3]].b -; VBITS_EQ_256-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_EQ_256-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_EQ_256-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b +; VBITS_EQ_256-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; HALF VECTOR: ; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl32 @@ -173,9 +172,8 @@ define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; VBITS_EQ_512-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s ; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h ; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b -; VBITS_EQ_512-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP2]].b -; VBITS_EQ_512-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_EQ_512-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_EQ_512-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b +; VBITS_EQ_512-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; QUARTER VECTOR OR SMALLER: ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl32 @@ -189,9 +187,8 @@ define void @srem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; VBITS_GE_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h ; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b -; VBITS_GE_1024-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP2]].b -; VBITS_GE_1024-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_GE_1024-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_GE_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b +; VBITS_GE_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; CHECK: ret %op1 = load <32 x i8>, <32 x i8>* %a %op2 = load <32 x i8>, <32 x i8>* %b @@ -227,9 +224,8 @@ define void @srem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h ; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h ; VBITS_EQ_512-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b -; VBITS_EQ_512-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP3]].b -; VBITS_EQ_512-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_EQ_512-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_EQ_512-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b +; VBITS_EQ_512-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; HALF VECTOR: ; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl64 @@ -246,9 +242,8 @@ define void @srem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; 
VBITS_EQ_1024-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s ; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h ; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b -; VBITS_EQ_1024-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP2]].b -; VBITS_EQ_1024-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_EQ_1024-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_EQ_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b +; VBITS_EQ_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; QUARTER VECTOR OR SMALLER: ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl64 @@ -262,9 +257,8 @@ define void @srem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; VBITS_GE_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h ; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b -; VBITS_GE_2048-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP2]].b -; VBITS_GE_2048-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_GE_2048-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_GE_2048-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b +; VBITS_GE_2048-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; CHECK: ret %op1 = load <64 x i8>, <64 x i8>* %a %op2 = load <64 x i8>, <64 x i8>* %b @@ -300,9 +294,8 @@ define void @srem_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { ; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h ; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h ; VBITS_EQ_1024-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b -; VBITS_EQ_1024-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP3]].b -; VBITS_EQ_1024-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_EQ_1024-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_EQ_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b +; VBITS_EQ_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; HALF VECTOR: ; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl128 @@ -319,9 +312,8 @@ define void @srem_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { ; VBITS_EQ_2048-NEXT: sdivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s ; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h ; VBITS_EQ_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b -; VBITS_EQ_2048-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP2]].b -; VBITS_EQ_2048-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_EQ_2048-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_EQ_2048-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b +; VBITS_EQ_2048-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; CHECK: ret %op1 = load <128 x i8>, <128 x i8>* %a %op2 = load <128 x i8>, <128 x i8>* %b @@ -357,9 +349,8 @@ define void @srem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { ; VBITS_EQ_2048-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h ; VBITS_EQ_2048-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h ; VBITS_EQ_2048-NEXT: uzp1 [[ZIP:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b -; VBITS_EQ_2048-NEXT: mul [[MUL:z[0-9]+]].b, [[PG]]/m, [[OP2]].b, [[ZIP]].b -; VBITS_EQ_2048-NEXT: sub [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[MUL]].b -; VBITS_EQ_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_EQ_2048-NEXT: mls [[OP1]].b, [[PG]]/m, [[ZIP]].b, [[OP2]].b +; VBITS_EQ_2048-NEXT: st1b { [[OP1]].b }, [[PG]], [x0] ; VBITS_EQ_2048-NEXT: ret %op1 = load <256 x i8>, <256 x i8>* %a %op2 = load <256 x i8>, <256 x i8>* 
%b @@ -442,9 +433,8 @@ define void @srem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { ; VBITS_EQ_256-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]] ; VBITS_EQ_256-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s ; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h -; VBITS_EQ_256-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h -; VBITS_EQ_256-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h -; VBITS_EQ_256-NEXT: st1h { [[OP1:z[0-9]+]].h }, [[PG1]], [x0] +; VBITS_EQ_256-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h +; VBITS_EQ_256-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0] ; HALF VECTOR OR SMALLER: ; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16 @@ -455,9 +445,8 @@ define void @srem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { ; VBITS_GE_512-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_512-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s ; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h -; VBITS_GE_512-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h -; VBITS_GE_512-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h -; VBITS_GE_512-NEXT: st1h { [[OP1:z[0-9]+]].h }, [[PG1]], [x0] +; VBITS_GE_512-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h +; VBITS_GE_512-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0] ; CHECK: ret %op1 = load <16 x i16>, <16 x i16>* %a @@ -483,9 +472,8 @@ define void @srem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; VBITS_EQ_512-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]] ; VBITS_EQ_512-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s ; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h -; VBITS_EQ_512-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h -; VBITS_EQ_512-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h -; VBITS_EQ_512-NEXT: st1h { [[OP1:z[0-9]+]].h }, [[PG1]], [x0] +; VBITS_EQ_512-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h +; VBITS_EQ_512-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0] ; HALF VECTOR OR SMALLER: ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32 @@ -496,9 +484,8 @@ define void @srem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; VBITS_GE_1024-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_1024-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h -; VBITS_GE_1024-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h -; VBITS_GE_1024-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h -; VBITS_GE_1024-NEXT: st1h { [[OP1:z[0-9]+]].h }, [[PG1]], [x0] +; VBITS_GE_1024-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h +; VBITS_GE_1024-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0] ; CHECK: ret %op1 = load <32 x i16>, <32 x i16>* %a %op2 = load <32 x i16>, <32 x i16>* %b @@ -521,9 +508,8 @@ define void @srem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { ; VBITS_EQ_1024-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]] ; VBITS_EQ_1024-NEXT: sdiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s ; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h -; VBITS_EQ_1024-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h -; VBITS_EQ_1024-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h -; VBITS_EQ_1024-NEXT: st1h { [[OP1:z[0-9]+]].h }, [[PG1]], [x0] +; VBITS_EQ_1024-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h +; VBITS_EQ_1024-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0] ; HALF VECTOR OR SMALLER: ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64 @@ -534,9 +520,8 @@ define void 
@srem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { ; VBITS_GE_2048-NEXT: sunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_2048-NEXT: sdivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h -; VBITS_GE_2048-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h -; VBITS_GE_2048-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h -; VBITS_GE_2048-NEXT: st1h { [[OP1:z[0-9]+]].h }, [[PG1]], [x0] +; VBITS_GE_2048-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h +; VBITS_GE_2048-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0] ; CHECK: ret %op1 = load <64 x i16>, <64 x i16>* %a %op2 = load <64 x i16>, <64 x i16>* %b @@ -559,9 +544,8 @@ define void @srem_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { ; VBITS_EQ_2048-NEXT: movprfx [[OP3_LO:z[0-9]+]], [[OP1_LO]] ; VBITS_EQ_2048-NEXT: sdiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[OP3_LO]].s, [[OP2_LO]].s ; VBITS_EQ_2048-NEXT: uzp1 [[ZIP:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h -; VBITS_EQ_2048-NEXT: mul [[MUL:z[0-9]+]].h, [[PG]]/m, [[OP2]].h, [[ZIP]].h -; VBITS_EQ_2048-NEXT: sub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[MUL]].h -; VBITS_EQ_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_EQ_2048-NEXT: mls [[OP1]].h, [[PG]]/m, [[ZIP]].h, [[OP2]].h +; VBITS_EQ_2048-NEXT: st1h { [[OP1]].h }, [[PG]], [x0] ; VBITS_EQ_2048-NEXT: ret %op1 = load <128 x i16>, <128 x i16>* %a %op2 = load <128 x i16>, <128 x i16>* %b @@ -617,9 +601,8 @@ define void @srem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { ; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; CHECK-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s -; CHECK-NEXT: mul [[MUL:z[0-9]+]].s, [[PG]]/m, [[OP2]].s, [[DIV]].s -; CHECK-NEXT: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[MUL]].s -; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; CHECK-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s +; CHECK-NEXT: st1w { [[OP1]].s }, [[PG]], [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, <8 x i32>* %a %op2 = load <8 x i32>, <8 x i32>* %b @@ -635,9 +618,8 @@ define void @srem_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { ; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; VBITS_GE_512-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; VBITS_GE_512-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s -; VBITS_GE_512-NEXT: mul [[MUL:z[0-9]+]].s, [[PG]]/m, [[OP2]].s, [[DIV]].s -; VBITS_GE_512-NEXT: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[MUL]].s -; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_512-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s +; VBITS_GE_512-NEXT: st1w { [[OP1]].s }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a %op2 = load <16 x i32>, <16 x i32>* %b @@ -653,9 +635,8 @@ define void @srem_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { ; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; VBITS_GE_1024-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; VBITS_GE_1024-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s -; VBITS_GE_1024-NEXT: mul [[MUL:z[0-9]+]].s, [[PG]]/m, [[OP2]].s, [[DIV]].s -; VBITS_GE_1024-NEXT: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[MUL]].s -; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_1024-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s +; VBITS_GE_1024-NEXT: st1w { [[OP1]].s }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret %op1 = load <32 x i32>, <32 x i32>* %a %op2 = load <32 x i32>, <32 x i32>* %b @@ -671,9 +652,8 @@ define void 
@srem_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; VBITS_GE_2048-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; VBITS_GE_2048-NEXT: sdiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s -; VBITS_GE_2048-NEXT: mul [[MUL:z[0-9]+]].s, [[PG]]/m, [[OP2]].s, [[DIV]].s -; VBITS_GE_2048-NEXT: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[MUL]].s -; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_2048-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s +; VBITS_GE_2048-NEXT: st1w { [[OP1]].s }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret %op1 = load <64 x i32>, <64 x i32>* %a %op2 = load <64 x i32>, <64 x i32>* %b @@ -735,9 +715,8 @@ define void @srem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { ; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; CHECK-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d -; CHECK-NEXT: mul [[MUL:z[0-9]+]].d, [[PG]]/m, [[OP2]].d, [[DIV]].d -; CHECK-NEXT: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[MUL]].d -; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; CHECK-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d +; CHECK-NEXT: st1d { [[OP1]].d }, [[PG]], [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, <4 x i64>* %a %op2 = load <4 x i64>, <4 x i64>* %b @@ -753,9 +732,8 @@ define void @srem_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { ; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_512-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; VBITS_GE_512-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d -; VBITS_GE_512-NEXT: mul [[MUL:z[0-9]+]].d, [[PG]]/m, [[OP2]].d, [[DIV]].d -; VBITS_GE_512-NEXT: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[MUL]].d -; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_512-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d +; VBITS_GE_512-NEXT: st1d { [[OP1]].d }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a %op2 = load <8 x i64>, <8 x i64>* %b @@ -771,9 +749,8 @@ define void @srem_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { ; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_1024-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; VBITS_GE_1024-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d -; VBITS_GE_1024-NEXT: mul [[MUL:z[0-9]+]].d, [[PG]]/m, [[OP2]].d, [[DIV]].d -; VBITS_GE_1024-NEXT: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[MUL]].d -; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_1024-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d +; VBITS_GE_1024-NEXT: st1d { [[OP1]].d }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret %op1 = load <16 x i64>, <16 x i64>* %a %op2 = load <16 x i64>, <16 x i64>* %b @@ -789,9 +766,8 @@ define void @srem_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_2048-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; VBITS_GE_2048-NEXT: sdiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d -; VBITS_GE_2048-NEXT: mul [[MUL:z[0-9]+]].d, [[PG]]/m, [[OP2]].d, [[DIV]].d -; VBITS_GE_2048-NEXT: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[MUL]].d -; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_2048-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d +; VBITS_GE_2048-NEXT: st1d { [[OP1]].d }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret %op1 = load <32 x i64>, <32 x i64>* %a %op2 = load <32 x i64>, <32 x i64>* %b @@ -937,9 +913,8 @@ define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; 
VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h ; VBITS_EQ_256-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h ; VBITS_EQ_256-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b -; VBITS_EQ_256-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP3]].b -; VBITS_EQ_256-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_EQ_256-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_EQ_256-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b +; VBITS_EQ_256-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; HALF VECTOR: ; VBITS_EQ_512: ptrue [[PG1:p[0-9]+]].b, vl32 @@ -956,9 +931,8 @@ define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; VBITS_EQ_512-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s ; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h ; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b -; VBITS_EQ_512-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP2]].b -; VBITS_EQ_512-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_EQ_512-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_EQ_512-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b +; VBITS_EQ_512-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; QUARTER VECTOR OR SMALLER: ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].b, vl32 @@ -972,9 +946,8 @@ define void @urem_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; VBITS_GE_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h ; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b -; VBITS_GE_1024-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP2]].b -; VBITS_GE_1024-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_GE_1024-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_GE_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b +; VBITS_GE_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; CHECK: ret %op1 = load <32 x i8>, <32 x i8>* %a %op2 = load <32 x i8>, <32 x i8>* %b @@ -1010,9 +983,8 @@ define void @urem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h ; VBITS_EQ_512-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h ; VBITS_EQ_512-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b -; VBITS_EQ_512-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP3]].b -; VBITS_EQ_512-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_EQ_512-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_EQ_512-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b +; VBITS_EQ_512-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; HALF VECTOR: ; VBITS_EQ_1024: ptrue [[PG1:p[0-9]+]].b, vl64 @@ -1029,9 +1001,8 @@ define void @urem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; VBITS_EQ_1024-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s ; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h ; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b -; VBITS_EQ_1024-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP2]].b -; VBITS_EQ_1024-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_EQ_1024-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_EQ_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b +; VBITS_EQ_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; QUARTER VECTOR OR SMALLER: ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].b, vl64 @@ -1045,9 +1016,8 @@ define void @urem_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; VBITS_GE_2048-NEXT: udivr 
[[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h ; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b -; VBITS_GE_2048-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP2]].b -; VBITS_GE_2048-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_GE_2048-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_GE_2048-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b +; VBITS_GE_2048-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; CHECK: ret %op1 = load <64 x i8>, <64 x i8>* %a %op2 = load <64 x i8>, <64 x i8>* %b @@ -1083,9 +1053,8 @@ define void @urem_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { ; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h ; VBITS_EQ_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[DIV4]].h, [[DIV3]].h ; VBITS_EQ_1024-NEXT: uzp1 [[UZP3:z[0-9]+]].b, [[UZP2]].b, [[UZP1]].b -; VBITS_EQ_1024-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP3]].b -; VBITS_EQ_1024-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_EQ_1024-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_EQ_1024-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP3]].b, [[OP2]].b +; VBITS_EQ_1024-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; HALF VECTOR: ; VBITS_EQ_2048: ptrue [[PG1:p[0-9]+]].b, vl128 @@ -1102,9 +1071,8 @@ define void @urem_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { ; VBITS_EQ_2048-NEXT: udivr [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO_LO]].s, [[OP1_LO_LO]].s ; VBITS_EQ_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h ; VBITS_EQ_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].b, [[UZP1]].b, [[UZP1]].b -; VBITS_EQ_2048-NEXT: mul [[OP2]].b, [[PG1]]/m, [[OP2]].b, [[UZP2]].b -; VBITS_EQ_2048-NEXT: sub [[OP1]].b, [[PG1]]/m, [[OP1]].b, [[OP2]].b -; VBITS_EQ_2048-NEXT: st1b { [[OP1:z[0-9]+]].b }, [[PG1]], [x0] +; VBITS_EQ_2048-NEXT: mls [[OP1]].b, [[PG1]]/m, [[UZP2]].b, [[OP2]].b +; VBITS_EQ_2048-NEXT: st1b { [[OP1]].b }, [[PG1]], [x0] ; CHECK: ret %op1 = load <128 x i8>, <128 x i8>* %a %op2 = load <128 x i8>, <128 x i8>* %b @@ -1138,9 +1106,8 @@ define void @urem_v256i8(<256 x i8>* %a, <256 x i8>* %b) #0 { ; VBITS_EQ_2048-NEXT: uzp1 [[RES_HI:z[0-9]+]].h, [[RES_HI_LO]].h, [[RES_HI_HI]].h ; VBITS_EQ_2048-NEXT: uzp1 [[RES_LO:z[0-9]+]].h, [[RES_LO_LO]].h, [[RES_LO_HI]].h ; VBITS_EQ_2048-NEXT: uzp1 [[ZIP:z[0-9]+]].b, [[RES_LO]].b, [[RES_HI]].b -; VBITS_EQ_2048-NEXT: mul [[MUL:z[0-9]+]].b, [[PG]]/m, [[OP2]].b, [[ZIP]].b -; VBITS_EQ_2048-NEXT: sub [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[MUL]].b -; VBITS_EQ_2048-NEXT: st1b { [[RES]].b }, [[PG]], [x0] +; VBITS_EQ_2048-NEXT: mls [[OP1]].b, [[PG]]/m, [[ZIP]].b, [[OP2]].b +; VBITS_EQ_2048-NEXT: st1b { [[OP1]].b }, [[PG]], [x0] ; VBITS_EQ_2048-NEXT: ret %op1 = load <256 x i8>, <256 x i8>* %a %op2 = load <256 x i8>, <256 x i8>* %b @@ -1223,9 +1190,8 @@ define void @urem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { ; VBITS_EQ_256-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]] ; VBITS_EQ_256-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s ; VBITS_EQ_256-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h -; VBITS_EQ_256-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h -; VBITS_EQ_256-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h -; VBITS_EQ_256-NEXT: st1h { [[OP1:z[0-9]+]].h }, [[PG1]], [x0] +; VBITS_EQ_256-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h +; VBITS_EQ_256-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0] ; HALF VECTOR OR SMALLER: ; VBITS_GE_512: ptrue [[PG1:p[0-9]+]].h, vl16 @@ -1236,9 +1202,8 @@ define void 
@urem_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { ; VBITS_GE_512-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_512-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s ; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h -; VBITS_GE_512-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h -; VBITS_GE_512-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h -; VBITS_GE_512-NEXT: st1h { [[OP1:z[0-9]+]].h }, [[PG1]], [x0] +; VBITS_GE_512-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h +; VBITS_GE_512-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0] ; CHECK: ret %op1 = load <16 x i16>, <16 x i16>* %a %op2 = load <16 x i16>, <16 x i16>* %b @@ -1263,9 +1228,8 @@ define void @urem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; VBITS_EQ_512-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]] ; VBITS_EQ_512-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s ; VBITS_EQ_512-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h -; VBITS_EQ_512-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h -; VBITS_EQ_512-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h -; VBITS_EQ_512-NEXT: st1h { [[OP1:z[0-9]+]].h }, [[PG1]], [x0] +; VBITS_EQ_512-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h +; VBITS_EQ_512-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0] ; HALF VECTOR OR SMALLER: ; VBITS_GE_1024: ptrue [[PG1:p[0-9]+]].h, vl32 @@ -1276,9 +1240,8 @@ define void @urem_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; VBITS_GE_1024-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_1024-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h -; VBITS_GE_1024-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h -; VBITS_GE_1024-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h -; VBITS_GE_1024-NEXT: st1h { [[OP1:z[0-9]+]].h }, [[PG1]], [x0] +; VBITS_GE_1024-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h +; VBITS_GE_1024-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0] ; CHECK: ret %op1 = load <32 x i16>, <32 x i16>* %a %op2 = load <32 x i16>, <32 x i16>* %b @@ -1301,9 +1264,8 @@ define void @urem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { ; VBITS_EQ_1024-NEXT: movprfx [[OP1_LO_:z[0-9]+]], [[OP1_LO]] ; VBITS_EQ_1024-NEXT: udiv [[DIV2:z[0-9]+]].s, [[PG2]]/m, [[OP1_LO_]].s, [[OP2_LO]].s ; VBITS_EQ_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV2]].h, [[DIV1]].h -; VBITS_EQ_1024-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h -; VBITS_EQ_1024-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h -; VBITS_EQ_1024-NEXT: st1h { [[OP1:z[0-9]+]].h }, [[PG1]], [x0] +; VBITS_EQ_1024-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h +; VBITS_EQ_1024-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0] ; HALF VECTOR OR SMALLER: ; VBITS_GE_2048: ptrue [[PG1:p[0-9]+]].h, vl64 @@ -1314,9 +1276,8 @@ define void @urem_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { ; VBITS_GE_2048-NEXT: uunpklo [[OP1_LO:z[0-9]+]].s, [[OP1]].h ; VBITS_GE_2048-NEXT: udivr [[DIV1:z[0-9]+]].s, [[PG2]]/m, [[OP2_LO]].s, [[OP1_LO]].s ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].h, [[DIV1]].h, [[DIV1]].h -; VBITS_GE_2048-NEXT: mul [[OP2]].h, [[PG1]]/m, [[OP2]].h, [[UZP1]].h -; VBITS_GE_2048-NEXT: sub [[OP1]].h, [[PG1]]/m, [[OP1]].h, [[OP2]].h -; VBITS_GE_2048-NEXT: st1h { [[OP1:z[0-9]+]].h }, [[PG1]], [x0] +; VBITS_GE_2048-NEXT: mls [[OP1]].h, [[PG1]]/m, [[UZP1]].h, [[OP2]].h +; VBITS_GE_2048-NEXT: st1h { [[OP1]].h }, [[PG1]], [x0] ; CHECK: ret %op1 = load <64 x i16>, <64 x i16>* %a %op2 = load <64 x i16>, <64 x i16>* %b @@ 
-1339,9 +1300,8 @@ define void @urem_v128i16(<128 x i16>* %a, <128 x i16>* %b) #0 { ; VBITS_EQ_2048-NEXT: movprfx [[RES_LO:z[0-9]+]], [[OP1_LO]] ; VBITS_EQ_2048-NEXT: udiv [[RES_LO:z[0-9]+]].s, [[PG1]]/m, [[RES_LO]].s, [[OP2_LO]].s ; VBITS_EQ_2048-NEXT: uzp1 [[ZIP:z[0-9]+]].h, [[RES_LO]].h, [[RES_HI]].h -; VBITS_EQ_2048-NEXT: mul [[MUL:z[0-9]+]].h, [[PG]]/m, [[OP2]].h, [[ZIP]].h -; VBITS_EQ_2048-NEXT: sub [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[MUL]].h -; VBITS_EQ_2048-NEXT: st1h { [[RES]].h }, [[PG]], [x0] +; VBITS_EQ_2048-NEXT: mls [[OP1]].h, [[PG]]/m, [[ZIP]].h, [[OP2]].h +; VBITS_EQ_2048-NEXT: st1h { [[OP1]].h }, [[PG]], [x0] ; VBITS_EQ_2048-NEXT: ret %op1 = load <128 x i16>, <128 x i16>* %a %op2 = load <128 x i16>, <128 x i16>* %b @@ -1397,9 +1357,8 @@ define void @urem_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { ; CHECK-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; CHECK-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s -; CHECK-NEXT: mul [[MUL:z[0-9]+]].s, [[PG]]/m, [[OP2]].s, [[DIV]].s -; CHECK-NEXT: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[MUL]].s -; CHECK-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; CHECK-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s +; CHECK-NEXT: st1w { [[OP1]].s }, [[PG]], [x0] ; CHECK-NEXT: ret %op1 = load <8 x i32>, <8 x i32>* %a %op2 = load <8 x i32>, <8 x i32>* %b @@ -1415,9 +1374,8 @@ define void @urem_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { ; VBITS_GE_512-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; VBITS_GE_512-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; VBITS_GE_512-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s -; VBITS_GE_512-NEXT: mul [[MUL:z[0-9]+]].s, [[PG]]/m, [[OP2]].s, [[DIV]].s -; VBITS_GE_512-NEXT: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[MUL]].s -; VBITS_GE_512-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_512-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s +; VBITS_GE_512-NEXT: st1w { [[OP1]].s }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a %op2 = load <16 x i32>, <16 x i32>* %b @@ -1433,9 +1391,8 @@ define void @urem_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { ; VBITS_GE_1024-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; VBITS_GE_1024-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; VBITS_GE_1024-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s -; VBITS_GE_1024-NEXT: mul [[MUL:z[0-9]+]].s, [[PG]]/m, [[OP2]].s, [[DIV]].s -; VBITS_GE_1024-NEXT: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[MUL]].s -; VBITS_GE_1024-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_1024-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s +; VBITS_GE_1024-NEXT: st1w { [[OP1]].s }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret %op1 = load <32 x i32>, <32 x i32>* %a %op2 = load <32 x i32>, <32 x i32>* %b @@ -1451,9 +1408,8 @@ define void @urem_v64i32(<64 x i32>* %a, <64 x i32>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] ; VBITS_GE_2048-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; VBITS_GE_2048-NEXT: udiv [[DIV:z[0-9]+]].s, [[PG]]/m, [[PFX]].s, [[OP2]].s -; VBITS_GE_2048-NEXT: mul [[MUL:z[0-9]+]].s, [[PG]]/m, [[OP2]].s, [[DIV]].s -; VBITS_GE_2048-NEXT: sub [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[MUL]].s -; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_2048-NEXT: mls [[OP1]].s, [[PG]]/m, [[DIV]].s, [[OP2]].s +; VBITS_GE_2048-NEXT: st1w { [[OP1]].s }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret %op1 = load <64 x i32>, <64 x i32>* %a %op2 = load <64 x i32>, <64 x i32>* %b @@ -1515,9 
+1471,8 @@ define void @urem_v4i64(<4 x i64>* %a, <4 x i64>* %b) #0 { ; CHECK-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; CHECK-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d -; CHECK-NEXT: mul [[MUL:z[0-9]+]].d, [[PG]]/m, [[OP2]].d, [[DIV]].d -; CHECK-NEXT: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[MUL]].d -; CHECK-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; CHECK-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d +; CHECK-NEXT: st1d { [[OP1]].d }, [[PG]], [x0] ; CHECK-NEXT: ret %op1 = load <4 x i64>, <4 x i64>* %a %op2 = load <4 x i64>, <4 x i64>* %b @@ -1533,9 +1488,8 @@ define void @urem_v8i64(<8 x i64>* %a, <8 x i64>* %b) #0 { ; VBITS_GE_512-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_512-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; VBITS_GE_512-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d -; VBITS_GE_512-NEXT: mul [[MUL:z[0-9]+]].d, [[PG]]/m, [[OP2]].d, [[DIV]].d -; VBITS_GE_512-NEXT: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[MUL]].d -; VBITS_GE_512-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_512-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d +; VBITS_GE_512-NEXT: st1d { [[OP1]].d }, [[PG]], [x0] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x i64>, <8 x i64>* %a %op2 = load <8 x i64>, <8 x i64>* %b @@ -1551,9 +1505,8 @@ define void @urem_v16i64(<16 x i64>* %a, <16 x i64>* %b) #0 { ; VBITS_GE_1024-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_1024-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; VBITS_GE_1024-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d -; VBITS_GE_1024-NEXT: mul [[MUL:z[0-9]+]].d, [[PG]]/m, [[OP2]].d, [[DIV]].d -; VBITS_GE_1024-NEXT: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[MUL]].d -; VBITS_GE_1024-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_1024-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d +; VBITS_GE_1024-NEXT: st1d { [[OP1]].d }, [[PG]], [x0] ; VBITS_GE_1024-NEXT: ret %op1 = load <16 x i64>, <16 x i64>* %a %op2 = load <16 x i64>, <16 x i64>* %b @@ -1569,9 +1522,8 @@ define void @urem_v32i64(<32 x i64>* %a, <32 x i64>* %b) #0 { ; VBITS_GE_2048-NEXT: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_2048-NEXT: movprfx [[PFX:z[0-9]+]], [[OP1]] ; VBITS_GE_2048-NEXT: udiv [[DIV:z[0-9]+]].d, [[PG]]/m, [[PFX]].d, [[OP2]].d -; VBITS_GE_2048-NEXT: mul [[MUL:z[0-9]+]].d, [[PG]]/m, [[OP2]].d, [[DIV]].d -; VBITS_GE_2048-NEXT: sub [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[MUL]].d -; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG]], [x0] +; VBITS_GE_2048-NEXT: mls [[OP1]].d, [[PG]]/m, [[DIV]].d, [[OP2]].d +; VBITS_GE_2048-NEXT: st1d { [[OP1]].d }, [[PG]], [x0] ; VBITS_GE_2048-NEXT: ret %op1 = load <32 x i64>, <32 x i64>* %a %op2 = load <32 x i64>, <32 x i64>* %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll index 3090dc6edda54..38bf194d442a1 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-limit-duplane.ll @@ -11,9 +11,9 @@ define <4 x i32> @test(<16 x i32>* %arg1, <16 x i32>* %arg2) { ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] ; CHECK-NEXT: mov z0.d, z1.d -; CHECK-NEXT: add z2.s, p0/m, z2.s, z2.s +; CHECK-NEXT: add z2.s, z2.s, z2.s ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #16 -; CHECK-NEXT: add z1.s, p0/m, z1.s, z1.s +; CHECK-NEXT: add z1.s, z1.s, z1.s ; CHECK-NEXT: dup v0.4s, v0.s[2] ; CHECK-NEXT: st1w { z1.s }, p0, [x0, 
x8, lsl #2] ; CHECK-NEXT: st1w { z2.s }, p0, [x0] @@ -35,9 +35,9 @@ define <2 x i32> @test2(<16 x i32>* %arg1, <16 x i32>* %arg2) { ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] ; CHECK-NEXT: mov z0.d, z1.d -; CHECK-NEXT: add z2.s, p0/m, z2.s, z2.s +; CHECK-NEXT: add z2.s, z2.s, z2.s ; CHECK-NEXT: ext z0.b, z0.b, z1.b, #24 -; CHECK-NEXT: add z1.s, p0/m, z1.s, z1.s +; CHECK-NEXT: add z1.s, z1.s, z1.s ; CHECK-NEXT: dup v0.2s, v0.s[0] ; CHECK-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] ; CHECK-NEXT: st1w { z2.s }, p0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index b79db68bd3db9..0b76f0bbf46d3 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -1165,10 +1165,9 @@ define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %o ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: mov z2.d, x2 -; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: movprfx z0, z1 -; VBITS_GE_2048-NEXT: add z0.d, p1/m, z0.d, z2.d -; VBITS_GE_2048-NEXT: punpklo p1.h, p2.b +; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_2048-NEXT: add z0.d, z1.d, z2.d +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z0.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] @@ -1191,12 +1190,10 @@ define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 { ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: mov z2.d, #4 // =0x4 -; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: movprfx z0, z1 -; VBITS_GE_2048-NEXT: add z0.d, p1/m, z0.d, z2.d -; VBITS_GE_2048-NEXT: punpklo p1.h, p2.b -; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; VBITS_GE_2048-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 +; VBITS_GE_2048-NEXT: add z1.d, z1.d, #4 +; VBITS_GE_2048-NEXT: punpklo p1.h, p1.b +; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_2048-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index 22cd17bc5893a..a2dc244c848bd 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -1065,7 +1065,7 @@ define void @masked_scatter_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 % ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: mov z2.d, x2 ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d +; VBITS_GE_2048-NEXT: add z1.d, z1.d, z2.d ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d] @@ -1087,9 +1087,8 @@ define void @masked_scatter_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 { ; VBITS_GE_2048-NEXT: ptrue p1.d, vl32 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] -; VBITS_GE_2048-NEXT: mov z2.d, #4 // =0x4 ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 -; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d +; 
VBITS_GE_2048-NEXT: add z1.d, z1.d, #4 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: punpklo p0.h, p0.b ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-optimize-ptrue.ll index 0edf6a44b96f3..d7dad2d226096 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-optimize-ptrue.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-optimize-ptrue.ll @@ -9,7 +9,7 @@ define void @add_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] -; CHECK-NEXT: add z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: add z0.b, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <64 x i8>, <64 x i8>* %a @@ -25,7 +25,7 @@ define void @add_v32i16(<32 x i16>* %a, <32 x i16>* %b, <32 x i16>* %c) #0 { ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] -; CHECK-NEXT: add z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: add z0.h, z0.h, z1.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i16>, <32 x i16>* %a diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll index f42b4c0e999fb..2bf85201321ac 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-permute-zip-uzp-trn.ll @@ -42,9 +42,8 @@ define void @zip_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; VBITS_EQ_256-NEXT: zip1 z1.h, z1.h, z3.h ; VBITS_EQ_256-NEXT: zip2 z3.h, z0.h, z2.h ; VBITS_EQ_256-NEXT: zip1 z0.h, z0.h, z2.h -; VBITS_EQ_256-NEXT: add z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_256-NEXT: movprfx z1, z4 -; VBITS_EQ_256-NEXT: add z1.h, p0/m, z1.h, z3.h +; VBITS_EQ_256-NEXT: add z0.h, z1.h, z0.h +; VBITS_EQ_256-NEXT: add z1.h, z4.h, z3.h ; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_EQ_256-NEXT: ret @@ -56,7 +55,7 @@ define void @zip_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; VBITS_EQ_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_EQ_512-NEXT: zip1 z2.h, z0.h, z1.h ; VBITS_EQ_512-NEXT: zip2 z0.h, z0.h, z1.h -; VBITS_EQ_512-NEXT: add z0.h, p0/m, z0.h, z2.h +; VBITS_EQ_512-NEXT: add z0.h, z2.h, z0.h ; VBITS_EQ_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_EQ_512-NEXT: ret %tmp1 = load <32 x i16>, <32 x i16>* %a @@ -214,7 +213,7 @@ define void @trn_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x1] ; VBITS_EQ_256-NEXT: trn1 z2.b, z0.b, z1.b ; VBITS_EQ_256-NEXT: trn2 z0.b, z0.b, z1.b -; VBITS_EQ_256-NEXT: add z0.b, p0/m, z0.b, z2.b +; VBITS_EQ_256-NEXT: add z0.b, z2.b, z0.b ; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; @@ -225,7 +224,7 @@ define void @trn_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { ; VBITS_EQ_512-NEXT: ld1b { z1.b }, p0/z, [x1] ; VBITS_EQ_512-NEXT: trn1 z2.b, z0.b, z1.b ; VBITS_EQ_512-NEXT: trn2 z0.b, z0.b, z1.b -; VBITS_EQ_512-NEXT: add z0.b, p0/m, z0.b, z2.b +; VBITS_EQ_512-NEXT: add z0.b, z2.b, z0.b ; VBITS_EQ_512-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_EQ_512-NEXT: ret %tmp1 = load <32 x i8>, <32 x i8>* %a @@ -250,8 +249,8 @@ define void @trn_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; VBITS_EQ_256-NEXT: trn1 z5.h, z1.h, z3.h ; VBITS_EQ_256-NEXT: trn2 z0.h, z0.h, z2.h ; VBITS_EQ_256-NEXT: trn2 z1.h, z1.h, z3.h -; VBITS_EQ_256-NEXT: add z0.h, p0/m, z0.h, z4.h 
-; VBITS_EQ_256-NEXT: add z1.h, p0/m, z1.h, z5.h +; VBITS_EQ_256-NEXT: add z0.h, z4.h, z0.h +; VBITS_EQ_256-NEXT: add z1.h, z5.h, z1.h ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] ; VBITS_EQ_256-NEXT: st1h { z1.h }, p0, [x0] ; VBITS_EQ_256-NEXT: ret @@ -263,7 +262,7 @@ define void @trn_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { ; VBITS_EQ_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_EQ_512-NEXT: trn1 z2.h, z0.h, z1.h ; VBITS_EQ_512-NEXT: trn2 z0.h, z0.h, z1.h -; VBITS_EQ_512-NEXT: add z0.h, p0/m, z0.h, z2.h +; VBITS_EQ_512-NEXT: add z0.h, z2.h, z0.h ; VBITS_EQ_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_EQ_512-NEXT: ret %tmp1 = load <32 x i16>, <32 x i16>* %a @@ -283,7 +282,7 @@ define void @trn_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { ; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_EQ_256-NEXT: trn1 z2.h, z0.h, z1.h ; VBITS_EQ_256-NEXT: trn2 z0.h, z0.h, z1.h -; VBITS_EQ_256-NEXT: add z0.h, p0/m, z0.h, z2.h +; VBITS_EQ_256-NEXT: add z0.h, z2.h, z0.h ; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; @@ -294,7 +293,7 @@ define void @trn_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { ; VBITS_EQ_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_EQ_512-NEXT: trn1 z2.h, z0.h, z1.h ; VBITS_EQ_512-NEXT: trn2 z0.h, z0.h, z1.h -; VBITS_EQ_512-NEXT: add z0.h, p0/m, z0.h, z2.h +; VBITS_EQ_512-NEXT: add z0.h, z2.h, z0.h ; VBITS_EQ_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_EQ_512-NEXT: ret %tmp1 = load <16 x i16>, <16 x i16>* %a @@ -314,7 +313,7 @@ define void @trn_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { ; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_EQ_256-NEXT: trn1 z2.s, z0.s, z1.s ; VBITS_EQ_256-NEXT: trn2 z0.s, z0.s, z1.s -; VBITS_EQ_256-NEXT: add z0.s, p0/m, z0.s, z2.s +; VBITS_EQ_256-NEXT: add z0.s, z2.s, z0.s ; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; @@ -325,7 +324,7 @@ define void @trn_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { ; VBITS_EQ_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_EQ_512-NEXT: trn1 z2.s, z0.s, z1.s ; VBITS_EQ_512-NEXT: trn2 z0.s, z0.s, z1.s -; VBITS_EQ_512-NEXT: add z0.s, p0/m, z0.s, z2.s +; VBITS_EQ_512-NEXT: add z0.s, z2.s, z0.s ; VBITS_EQ_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_EQ_512-NEXT: ret %tmp1 = load <8 x i32>, <8 x i32>* %a @@ -395,7 +394,7 @@ define void @trn_v8i32_undef(<8 x i32>* %a) #0 { ; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_EQ_256-NEXT: trn1 z1.s, z0.s, z0.s ; VBITS_EQ_256-NEXT: trn2 z0.s, z0.s, z0.s -; VBITS_EQ_256-NEXT: add z0.s, p0/m, z0.s, z1.s +; VBITS_EQ_256-NEXT: add z0.s, z1.s, z0.s ; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_EQ_256-NEXT: ret ; @@ -405,7 +404,7 @@ define void @trn_v8i32_undef(<8 x i32>* %a) #0 { ; VBITS_EQ_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_EQ_512-NEXT: trn1 z1.s, z0.s, z0.s ; VBITS_EQ_512-NEXT: trn2 z0.s, z0.s, z0.s -; VBITS_EQ_512-NEXT: add z0.s, p0/m, z0.s, z1.s +; VBITS_EQ_512-NEXT: add z0.s, z1.s, z0.s ; VBITS_EQ_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_EQ_512-NEXT: ret %tmp1 = load <8 x i32>, <8 x i32>* %a @@ -495,7 +494,7 @@ define void @uzp_v32i8(<32 x i8>* %a, <32 x i8>* %b) #1 { ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: uzp1 z2.b, z0.b, z1.b ; CHECK-NEXT: uzp2 z0.b, z0.b, z1.b -; CHECK-NEXT: add z0.b, p0/m, z0.b, z2.b +; CHECK-NEXT: add z0.b, z2.b, z0.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %tmp1 = load <32 x i8>, <32 x i8>* %a @@ -521,11 +520,10 @@ define void @uzp_v32i16(<32 x i16>* %a, <32 x i16>* %b) #1 { ; CHECK-NEXT: ld1h { z3.h }, p0/z, [x1] ; CHECK-NEXT: uzp1 z5.h, z1.h, z0.h ; 
CHECK-NEXT: uzp2 z0.h, z1.h, z0.h -; CHECK-NEXT: add z0.h, p0/m, z0.h, z5.h ; CHECK-NEXT: uzp1 z4.h, z3.h, z2.h ; CHECK-NEXT: uzp2 z2.h, z3.h, z2.h -; CHECK-NEXT: movprfx z1, z4 -; CHECK-NEXT: add z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: add z0.h, z5.h, z0.h +; CHECK-NEXT: add z1.h, z4.h, z2.h ; CHECK-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -548,7 +546,7 @@ define void @uzp_v16i16(<16 x i16>* %a, <16 x i16>* %b) #1 { ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: uzp1 z2.h, z0.h, z1.h ; CHECK-NEXT: uzp2 z0.h, z0.h, z1.h -; CHECK-NEXT: add z0.h, p0/m, z0.h, z2.h +; CHECK-NEXT: add z0.h, z2.h, z0.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %tmp1 = load <16 x i16>, <16 x i16>* %a @@ -592,7 +590,7 @@ define void @uzp_v4i64(<4 x i64>* %a, <4 x i64>* %b) #1 { ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: uzp1 z2.d, z0.d, z1.d ; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d -; CHECK-NEXT: add z0.d, p0/m, z0.d, z2.d +; CHECK-NEXT: add z0.d, z2.d, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %tmp1 = load <4 x i64>, <4 x i64>* %a @@ -632,7 +630,7 @@ define void @uzp_v8i32_undef(<8 x i32>* %a) #1 { ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: uzp1 z1.s, z0.s, z0.s ; CHECK-NEXT: uzp2 z0.s, z0.s, z0.s -; CHECK-NEXT: add z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: add z0.s, z1.s, z0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %tmp1 = load <8 x i32>, <8 x i32>* %a diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll index 656a821c446cf..275716e06c235 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc.ll @@ -41,7 +41,7 @@ define void @trunc_v32i16_v32i8(<32 x i16>* %in, <32 x i8>* %out) #0 { ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 ; VBITS_GE_512: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_512: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b -; VBITS_GE_512: add [[A_BYTES]].b, [[PG]]/m, [[A_BYTES]].b, [[A_BYTES]].b +; VBITS_GE_512: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b %a = load <32 x i16>, <32 x i16>* %in %b = trunc <32 x i16> %a to <32 x i8> %c = add <32 x i8> %b, %b @@ -55,7 +55,7 @@ define void @trunc_v64i16_v64i8(<64 x i16>* %in, <64 x i8>* %out) #0 { ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 ; VBITS_GE_1024: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_1024: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b -; VBITS_GE_1024: add [[A_BYTES]].b, [[PG]]/m, [[A_BYTES]].b, [[A_BYTES]].b +; VBITS_GE_1024: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b %a = load <64 x i16>, <64 x i16>* %in %b = trunc <64 x i16> %a to <64 x i8> %c = add <64 x i8> %b, %b @@ -69,7 +69,7 @@ define void @trunc_v128i16_v128i8(<128 x i16>* %in, <128 x i8>* %out) #0 { ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 ; VBITS_GE_2048: ld1h { [[A_HALFS:z[0-9]+]].h }, [[PG]]/z, [x0] ; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b -; VBITS_GE_2048: add [[A_BYTES]].b, [[PG]]/m, [[A_BYTES]].b, [[A_BYTES]].b +; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b %a = load <128 x i16>, <128 x i16>* %in %b = trunc <128 x i16> %a to <128 x i8> %c = add <128 x i8> %b, %b @@ -112,7 +112,7 @@ define void @trunc_v32i32_v32i8(<32 x i32>* %in, <32 x i8>* %out) #0 { ; VBITS_GE_1024: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h ; 
VBITS_GE_1024: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b -; VBITS_GE_1024: add [[A_BYTES]].b, [[PG]]/m, [[A_BYTES]].b, [[A_BYTES]].b +; VBITS_GE_1024: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b %a = load <32 x i32>, <32 x i32>* %in %b = trunc <32 x i32> %a to <32 x i8> %c = add <32 x i8> %b, %b @@ -127,7 +127,7 @@ define void @trunc_v64i32_v64i8(<64 x i32>* %in, <64 x i8>* %out) #0 { ; VBITS_GE_2048: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h ; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b -; VBITS_GE_2048: add [[A_BYTES]].b, [[PG]]/m, [[A_BYTES]].b, [[A_BYTES]].b +; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b %a = load <64 x i32>, <64 x i32>* %in %b = trunc <64 x i32> %a to <64 x i8> %c = add <64 x i8> %b, %b @@ -156,7 +156,7 @@ define void @trunc_v16i32_v16i16(<16 x i32>* %in, <16 x i16>* %out) #0 { ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 ; VBITS_GE_512: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_512: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h -; VBITS_GE_512: add [[A_HALFS]].h, [[PG]]/m, [[A_HALFS]].h, [[A_HALFS]].h +; VBITS_GE_512: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h %a = load <16 x i32>, <16 x i32>* %in %b = trunc <16 x i32> %a to <16 x i16> %c = add <16 x i16> %b, %b @@ -170,7 +170,7 @@ define void @trunc_v32i32_v32i16(<32 x i32>* %in, <32 x i16>* %out) #0 { ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 ; VBITS_GE_1024: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h -; VBITS_GE_1024: add [[A_HALFS]].h, [[PG]]/m, [[A_HALFS]].h, [[A_HALFS]].h +; VBITS_GE_1024: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h %a = load <32 x i32>, <32 x i32>* %in %b = trunc <32 x i32> %a to <32 x i16> %c = add <32 x i16> %b, %b @@ -184,7 +184,7 @@ define void @trunc_v64i32_v64i16(<64 x i32>* %in, <64 x i16>* %out) #0 { ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 ; VBITS_GE_2048: ld1w { [[A_WORDS:z[0-9]+]].s }, [[PG]]/z, [x0] ; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h -; VBITS_GE_2048: add [[A_HALFS]].h, [[PG]]/m, [[A_HALFS]].h, [[A_HALFS]].h +; VBITS_GE_2048: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h %a = load <64 x i32>, <64 x i32>* %in %b = trunc <64 x i32> %a to <64 x i16> %c = add <64 x i16> %b, %b @@ -243,7 +243,7 @@ define void @trunc_v32i64_v32i8(<32 x i64>* %in, <32 x i8>* %out) #0 { ; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s ; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h ; VBITS_GE_2048: uzp1 [[A_BYTES:z[0-9]+]].b, [[A_HALFS]].b, [[A_HALFS]].b -; VBITS_GE_2048: add [[A_BYTES]].b, [[PG]]/m, [[A_BYTES]].b, [[A_BYTES]].b +; VBITS_GE_2048: add [[A_BYTES]].b, [[A_BYTES]].b, [[A_BYTES]].b %a = load <32 x i64>, <32 x i64>* %in %b = trunc <32 x i64> %a to <32 x i8> %c = add <32 x i8> %b, %b @@ -286,7 +286,7 @@ define void @trunc_v16i64_v16i16(<16 x i64>* %in, <16 x i16>* %out) #0 { ; VBITS_GE_1024: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_1024: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s ; VBITS_GE_1024: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h -; VBITS_GE_1024: add [[A_HALFS]].h, [[PG]]/m, [[A_HALFS]].h, [[A_HALFS]].h +; VBITS_GE_1024: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h %a = load <16 x i64>, <16 x i64>* %in %b = trunc <16 x i64> %a to <16 x i16> %c = add <16 x i16> %b, %b @@ -301,7 
+301,7 @@ define void @trunc_v32i64_v32i16(<32 x i64>* %in, <32 x i16>* %out) #0 { ; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s ; VBITS_GE_2048: uzp1 [[A_HALFS:z[0-9]+]].h, [[A_WORDS]].h, [[A_WORDS]].h -; VBITS_GE_2048: add [[A_HALFS]].h, [[PG]]/m, [[A_HALFS]].h, [[A_HALFS]].h +; VBITS_GE_2048: add [[A_HALFS]].h, [[A_HALFS]].h, [[A_HALFS]].h %a = load <32 x i64>, <32 x i64>* %in %b = trunc <32 x i64> %a to <32 x i16> %c = add <32 x i16> %b, %b @@ -330,7 +330,7 @@ define void @trunc_v8i64_v8i32(<8 x i64>* %in, <8 x i32>* %out) #0 { ; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 ; VBITS_GE_512: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_512: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s -; VBITS_GE_512: add [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, [[A_WORDS]].s +; VBITS_GE_512: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s %a = load <8 x i64>, <8 x i64>* %in %b = trunc <8 x i64> %a to <8 x i32> %c = add <8 x i32> %b, %b @@ -344,7 +344,7 @@ define void @trunc_v16i64_v16i32(<16 x i64>* %in, <16 x i32>* %out) #0 { ; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 ; VBITS_GE_1024: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_1024: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s -; VBITS_GE_1024: add [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, [[A_WORDS]].s +; VBITS_GE_1024: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s %a = load <16 x i64>, <16 x i64>* %in %b = trunc <16 x i64> %a to <16 x i32> %c = add <16 x i32> %b, %b @@ -358,7 +358,7 @@ define void @trunc_v32i64_v32i32(<32 x i64>* %in, <32 x i32>* %out) #0 { ; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 ; VBITS_GE_2048: ld1d { [[A_DWORDS:z[0-9]+]].d }, [[PG]]/z, [x0] ; VBITS_GE_2048: uzp1 [[A_WORDS:z[0-9]+]].s, [[A_DWORDS]].s, [[A_DWORDS]].s -; VBITS_GE_2048: add [[A_WORDS]].s, [[PG]]/m, [[A_WORDS]].s, [[A_WORDS]].s +; VBITS_GE_2048: add [[A_WORDS]].s, [[A_WORDS]].s, [[A_WORDS]].s %a = load <32 x i64>, <32 x i64>* %in %b = trunc <32 x i64> %a to <32 x i32> %c = add <32 x i32> %b, %b diff --git a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll index 17edf7479bdc4..c93db15b0c7e1 100644 --- a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll @@ -50,7 +50,7 @@ define <4 x i64> @test_post_ld1_int_fixed(i64* %data, i64 %idx, <4 x i64>* %addr ; CHECK-NEXT: mov z1.d, z0.d ; CHECK-NEXT: mov z0.d, p2/m, x10 ; CHECK-NEXT: mov z1.d, p1/m, x11 -; CHECK-NEXT: add z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x8] ; CHECK-NEXT: ret %A = load <4 x i64>, <4 x i64>* %addr diff --git a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll index f3ea18c8ff748..2c13eea4ca4fa 100644 --- a/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll +++ b/llvm/test/CodeGen/AArch64/sve-vscale-attr.ll @@ -24,7 +24,7 @@ define void @func_vscale_none(<16 x i32>* %a, <16 x i32>* %b) #0 { ; CHECK-ARG-NEXT: ptrue p0.s, vl16 ; CHECK-ARG-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-ARG-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-ARG-NEXT: add z0.s, p0/m, z0.s, z1.s +; CHECK-ARG-NEXT: add z0.s, z0.s, z1.s ; CHECK-ARG-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-ARG-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a @@ -68,8 +68,8 @@ define void @func_vscale2_2(<16 x i32>* %a, <16 x i32>* %b) #2 { ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] ; 
CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] -; CHECK-NEXT: add z0.s, p0/m, z0.s, z2.s -; CHECK-NEXT: add z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: add z1.s, z1.s, z3.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] ; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret @@ -91,8 +91,8 @@ define void @func_vscale2_4(<16 x i32>* %a, <16 x i32>* %b) #3 { ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] -; CHECK-NEXT: add z0.s, p0/m, z0.s, z2.s -; CHECK-NEXT: add z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: add z1.s, z1.s, z3.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] ; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: ret @@ -111,7 +111,7 @@ define void @func_vscale4_4(<16 x i32>* %a, <16 x i32>* %b) #4 { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: add z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a @@ -129,7 +129,7 @@ define void @func_vscale8_8(<16 x i32>* %a, <16 x i32>* %b) #5 { ; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] -; CHECK-NEXT: add z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: add z0.s, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a