diff --git a/llvm/lib/Target/AArch64/AArch64SchedA510.td b/llvm/lib/Target/AArch64/AArch64SchedA510.td index 68343674bc819..9456878946151 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA510.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA510.td @@ -254,7 +254,7 @@ def : InstRW<[WriteIS], (instrs RBITWr, RBITXr)>; // Compute pointer authentication code for data address // Compute pointer authentication code, using generic key // Compute pointer authentication code for instruction address -def : InstRW<[CortexA510Write<3, CortexA510UnitPAC>], (instregex "^AUT", "^PAC")>; +def : InstRW<[CortexA510Write<5, CortexA510UnitPAC>], (instregex "^AUT", "^PAC")>; // Branch and link, register, with pointer authentication // Branch, register, with pointer authentication @@ -401,30 +401,30 @@ def : InstRW<[CortexA510WriteFPALU_F3], (instrs FCSELHrrr, FCSELSrrr, FCSELDrrr) def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(2i32|4i16|8i8)")>; def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(16i8|4i32|8i16)")>; // ASIMD absolute diff accum -def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "[SU]ABAL?v")>; +def : InstRW<[CortexA510Write<6, CortexA510UnitVALU>], (instregex "[SU]ABAL?v")>; // ASIMD absolute diff long def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDLv")>; // ASIMD arith #1 -def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v(1i64|2i32|4i16|8i8)", - "[SU]R?HADDv(2i32|4i16|8i8)", "[SU]HSUBv(2i32|4i16|8i8)")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v(2i64|4i32|8i16|16i8)", - "[SU]R?HADDv(8i16|4i32|16i8)", "[SU]HSUBv(8i16|4i32|16i8)")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v", + "[SU]R?HADDv", "[SU]HSUBv")>; // ASIMD arith #2 -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ABSv(1i64|2i32|4i16|8i8)$", +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ABSv(1i64|2i32|4i16|8i8)$", "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$", - "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$", "ADDPv(2i32|4i16|8i8)$")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ABSv(2i64|4i32|8i16|16i8)$", +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ABSv(2i64|4i32|8i16|16i8)$", "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$", - "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$", "ADDPv(16i8|2i64|4i32|8i16)$")>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$")>; // ASIMD arith #3 -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SADDLv", "UADDLv", "SADDWv", - "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv", "ADDHNv", "SUBHNv")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "SADDLv", "UADDLv", "SADDWv", + "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv")>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ADDHNv", "SUBHNv")>; // ASIMD arith #5 -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "RADDHNv", "RSUBHNv")>; +def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "RADDHNv", "RSUBHNv")>; // ASIMD arith, reduce -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ADDVv", "SADDLVv", "UADDLVv")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ADDVv")>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SADDLVv", "UADDLVv")>; // ASIMD compare #1 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>; def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>; @@ -437,8 +437,8 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT| def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v16i8", "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>; // ASIMD max/min, basic -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(16i8|4i132|8i16)")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(16i8|4i132|8i16)")>; // SIMD max/min, reduce def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MAX|MIN)Vv")>; // ASIMD multiply, by element @@ -467,12 +467,12 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]MULLv", " // ASIMD polynomial (8x8) multiply long def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULLv8i8, PMULLv16i8)>; // ASIMD pairwise add and accumulate -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]ADALPv")>; +def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "[SU]ADALPv")>; // ASIMD shift accumulate -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>; -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>; // ASIMD shift accumulate #2 -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]RSRA[vd]")>; +def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "[SU]RSRA[vd]")>; // ASIMD shift by immed def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "SHLd$", "SHLv", "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>; @@ -504,7 +504,7 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QRSHLv(2i def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^AES[DE]rr$", "^AESI?MCrr")>; // Crypto polynomial (64x64) multiply long -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs PMULLv1i64, PMULLv2i64)>; +def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instrs PMULLv1i64, PMULLv2i64)>; // Crypto SHA1 hash acceleration op // Crypto SHA1 schedule acceleration ops @@ -512,25 +512,26 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^SHA1(H|SU0|S // Crypto SHA1 hash acceleration ops // Crypto SHA256 hash acceleration ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA1[CMP]", "^SHA256H2?")>; +def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instregex "^SHA1[CMP]", "^SHA256H2?")>; // Crypto SHA256 schedule acceleration ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA256SU[01]")>; +def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instregex "^SHA256SU[01]")>; // Crypto SHA512 hash acceleration ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA512(H|H2|SU0|SU1)")>; +def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instregex "^SHA512(H|H2|SU0|SU1)")>; // Crypto SHA3 ops -def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs BCAX, EOR3, XAR)>; -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs RAX1)>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs BCAX, EOR3)>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs XAR)>; +def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instrs RAX1)>; // Crypto SM3 ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SM3PARTW[12]$", "^SM3SS1$", +def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instregex "^SM3PARTW[12]$", "^SM3SS1$", "^SM3TT[12][AB]$")>; // Crypto SM4 ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs SM4E, SM4ENCKEY)>; +def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instrs SM4E, SM4ENCKEY)>; // CRC // ----------------------------------------------------------------------------- @@ -540,25 +541,25 @@ def : InstRW<[CortexA510MCWrite<2, 0, CortexA510UnitMAC>], (instregex "^CRC32")> // SVE Predicate instructions // Loop control, based on predicate -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKA_PPmP, BRKA_PPzP, +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKA_PPmP, BRKA_PPzP, BRKB_PPmP, BRKB_PPzP)>; // Loop control, based on predicate and flag setting -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKAS_PPzP, BRKBS_PPzP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKAS_PPzP, BRKBS_PPzP)>; // Loop control, propagating -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>; // Loop control, propagating and flag setting -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKNS_PPzP)>; -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKPAS_PPzPP, BRKPBS_PPzPP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKNS_PPzP)>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instrs BRKPAS_PPzPP, BRKPBS_PPzPP)>; // Loop control, based on GPR -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>; -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>; // Loop terminate def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instregex "^CTERM(EQ|NE)_(WW|XX)")>; @@ -569,20 +570,20 @@ def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs ADDPL_XXI, ADDVL_X def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instregex "^CNT[BHWD]_XPiI")>; -def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], +def : InstRW<[CortexA510Write<3, CortexA510UnitALU>], (instregex "^(INC|DEC)[BHWD]_XPiI")>; -def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], +def : InstRW<[CortexA510Write<4, CortexA510UnitALU>], (instregex "^(SQINC|SQDEC|UQINC|UQDEC)[BHWD]_[XW]Pi(Wd)?I")>; // Predicate counting scalar, active predicate -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^CNTP_XPP_[BHSD]")>; -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^(DEC|INC)P_XP_[BHSD]")>; -def : InstRW<[CortexA510Write<8, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<9, CortexA510UnitVALU0>], (instregex "^(SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]", "^(UQDEC|UQINC)P_WP_[BHSD]", "^(SQDEC|SQINC|UQDEC|UQINC)P_XPWd_[BHSD]")>; @@ -593,39 +594,39 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>; // Predicate logical -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>; // Predicate logical, flag setting -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>; // Predicate reverse -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^REV_PP_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^REV_PP_[BHSD]")>; // Predicate select -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs SEL_PPPP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs SEL_PPPP)>; // Predicate set -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PFALSE", "^PTRUE_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PFALSE", "^PTRUE_[BHSD]")>; // Predicate set/initialize, set flags -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PTRUES_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PTRUES_[BHSD]")>; // Predicate find first/next -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>; // Predicate test -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs PTEST_PP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs PTEST_PP)>; // Predicate transpose -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^TRN[12]_PPP_[BHSDQ]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^TRN[12]_PPP_[BHSDQ]")>; // Predicate unpack and widen -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs PUNPKHI_PP, PUNPKLO_PP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs PUNPKHI_PP, PUNPKLO_PP)>; // Predicate zip/unzip -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]")>; // SVE integer instructions @@ -634,10 +635,10 @@ def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^(ZIP|UZP)[1 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABD_(ZPmZ|ZPZZ)_[BHSD]")>; // Arithmetic, absolute diff accum -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^[SU]ABA_ZZZ_[BHSD]")>; +def : InstRW<[CortexA510MCWrite<6, 2, CortexA510UnitVALU>], (instregex "^[SU]ABA_ZZZ_[BHSD]")>; // Arithmetic, absolute diff accum long -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>; +def : InstRW<[CortexA510MCWrite<6, 2, CortexA510UnitVALU>], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>; // Arithmetic, absolute diff long def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>; @@ -651,20 +652,22 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], "^(ADD|SUB|SUBR)_ZI_[BHSD]", "^ADR_[SU]XTW_ZZZ_D_[0123]", "^ADR_LSL_ZZZ_[SD]_[0123]", - "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]", + "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]")>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], + (instregex "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]", "^SADDLBT_ZZZ_[HSD]", - "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]", "^SSUBL(BT|TB)_ZZZ_[HSD]")>; // Arithmetic, complex def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], - (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]", - "^SQ(ABS|NEG)_ZPmZ_[BHSD]", + (instregex "^SQ(ABS|NEG)_ZPmZ_[BHSD]", "^SQ(ADD|SUB|SUBR)_ZPmZ_?[BHSD]", "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]", "^[SU]Q(ADD|SUB)_ZI_[BHSD]", "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]", "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>; +def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], + (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]")>; // Arithmetic, large integer def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>; @@ -735,14 +738,14 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(BSL|BSL1N|B // Count/reverse bits def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(CLS|CLZ|RBIT)_ZPmZ_[BHSD]")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_[BH]")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_[BH]")>; def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_S")>; def : InstRW<[CortexA510Write<12, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_D")>; // Broadcast logical bitmask immediate to vector def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs DUPM_ZI)>; // Compare and set flags -def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]", "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>; @@ -939,12 +942,14 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDMULH_ZZZ // Multiply/multiply long, (8x8) polynomial def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^PMUL_ZZZ_B")>; -def : InstRW<[CortexA510Write<6, CortexA510UnitVMC>], (instregex "^PMULL[BT]_ZZZ_[HDQ]")>; +def : InstRW<[CortexA510Write<9, CortexA510UnitVMC>], (instregex "^PMULL[BT]_ZZZ_[HDQ]")>; // Predicate counting vector +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], + (instregex "^(DEC|INC)[HWD]_ZPiI")>; def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], - (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI")>; + (instregex "^(SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI")>; // Reciprocal estimate def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>; @@ -965,7 +970,7 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MA def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^(ANDV|EORV|ORV)_VPZ_[BHSD]")>; // Reverse, vector -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^REV_ZZ_[BHSD]", +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^REV_ZZ_[BHSD]", "^REVB_ZPmZ_[HSD]", "^REVH_ZPmZ_[SD]", "^REVW_ZPmZ_D")>; @@ -980,13 +985,13 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBL_ZZZZ?_[B def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBX_ZZZ_[BHSD]")>; // Transpose, vector form -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>; // Unpack and extend def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>; // Zip/unzip -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>; // SVE floating-point instructions // ----------------------------------------------------------------------------- @@ -1142,7 +1147,7 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTMAD_ZZI_[H // Floating point trigonometric, miscellaneous def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTSMUL_ZZZ_[HSD]")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FTSSEL_ZZZ_[HSD]")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^FTSSEL_ZZZ_[HSD]")>; // SVE BFloat16 (BF16) instructions @@ -1251,12 +1256,12 @@ def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], "^GLD(FF)?1D(_SCALED)?$")>; // Gather load, 32-bit scaled offset -def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLd>], +def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLd>], (instregex "^GLD(FF)?1S?[HW]_S_[SU]XTW_SCALED$", "^GLD(FF)?1W_[SU]XTW_SCALED")>; // Gather load, 32-bit unpacked unscaled offset -def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLd>], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$", +def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLd>], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$", "^GLD(FF)?1W_[SU]XTW$")>; def : InstRW<[CortexA510Write<0, CortexA510UnitVALU>], (instregex "^PRF(B|H|W|D).*")>; @@ -1377,12 +1382,12 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^AES[DE]_ZZZ_ "^AESI?MC_ZZ_B$")>; // Crypto SHA3 ops -def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(BCAX|EOR3)_ZZZZ$", +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(BCAX|EOR3)_ZZZZ$", "^XAR_ZZZI_[BHSD]$")>; -def : InstRW<[CortexA510MC_RC0Write<8, CortexA510UnitVMC>], (instregex "^RAX1_ZZZ_D$")>; +def : InstRW<[CortexA510MC_RC0Write<9, CortexA510UnitVMC>], (instregex "^RAX1_ZZZ_D$")>; // Crypto SM4 ops -def : InstRW<[CortexA510MC_RC0Write<8, CortexA510UnitVMC>], (instregex "^SM4E(KEY)?_ZZZ_S$")>; +def : InstRW<[CortexA510MC_RC0Write<9, CortexA510UnitVMC>], (instregex "^SM4E(KEY)?_ZZZ_S$")>; } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll index ceef0c49a45ec..9a525151ca328 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll @@ -140,10 +140,10 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) { ; GISEL-NEXT: umull2 v2.4s, v0.8h, v1.8h ; GISEL-NEXT: umull v1.4s, v0.4h, v1.4h ; GISEL-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; GISEL-NEXT: ldr q2, [x8, :lo12:.LCPI3_0] ; GISEL-NEXT: sub v0.8h, v0.8h, v1.8h ; GISEL-NEXT: usra v1.8h, v0.8h, #1 -; GISEL-NEXT: ldr q0, [x8, :lo12:.LCPI3_0] -; GISEL-NEXT: neg v0.8h, v0.8h +; GISEL-NEXT: neg v0.8h, v2.8h ; GISEL-NEXT: ushl v0.8h, v1.8h, v0.8h ; GISEL-NEXT: ret %1 = udiv <8 x i16> %x, @@ -170,13 +170,13 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { ; GISEL-LABEL: combine_vec_udiv_nonuniform4: ; GISEL: // %bb.0: ; GISEL-NEXT: adrp x8, .LCPI4_2 +; GISEL-NEXT: adrp x9, .LCPI4_0 ; GISEL-NEXT: ldr q1, [x8, :lo12:.LCPI4_2] ; GISEL-NEXT: adrp x8, .LCPI4_1 +; GISEL-NEXT: ldr q4, [x9, :lo12:.LCPI4_0] ; GISEL-NEXT: ldr q3, [x8, :lo12:.LCPI4_1] -; GISEL-NEXT: adrp x8, .LCPI4_0 ; GISEL-NEXT: umull2 v2.8h, v0.16b, v1.16b ; GISEL-NEXT: umull v1.8h, v0.8b, v1.8b -; GISEL-NEXT: ldr q4, [x8, :lo12:.LCPI4_0] ; GISEL-NEXT: uzp2 v1.16b, v1.16b, v2.16b ; GISEL-NEXT: neg v2.16b, v3.16b ; GISEL-NEXT: shl v3.16b, v4.16b, #7 diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll index b77d591347659..ee035ec1941d5 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -101,12 +101,12 @@ define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias ; GISEL-NEXT: ushll v2.8h, v2.8b, #0 ; GISEL-NEXT: usubl v3.4s, v1.4h, v2.4h ; GISEL-NEXT: usubl2 v1.4s, v1.8h, v2.8h -; GISEL-NEXT: neg v2.4s, v3.4s -; GISEL-NEXT: neg v4.4s, v1.4s -; GISEL-NEXT: cmgt v5.4s, v0.4s, v3.4s +; GISEL-NEXT: cmgt v2.4s, v0.4s, v3.4s ; GISEL-NEXT: cmgt v0.4s, v0.4s, v1.4s -; GISEL-NEXT: bif v2.16b, v3.16b, v5.16b -; GISEL-NEXT: bsl v0.16b, v4.16b, v1.16b +; GISEL-NEXT: neg v4.4s, v3.4s +; GISEL-NEXT: neg v5.4s, v1.4s +; GISEL-NEXT: bsl v2.16b, v4.16b, v3.16b +; GISEL-NEXT: bsl v0.16b, v5.16b, v1.16b ; GISEL-NEXT: add v0.4s, v2.4s, v0.4s ; GISEL-NEXT: addv s0, v0.4s ; GISEL-NEXT: fmov w0, s0 diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll index fdeae9f326ad8..36b81d8e495ce 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll @@ -4,8 +4,8 @@ define @dupsext_v2i8_v2i16(i8 %src, %b) { ; CHECK-LABEL: dupsext_v2i8_v2i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -20,8 +20,8 @@ entry: define @dupsext_v4i8_v4i16(i8 %src, %b) { ; CHECK-LABEL: dupsext_v4i8_v4i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -36,8 +36,8 @@ entry: define @dupsext_v8i8_v8i16(i8 %src, %b) { ; CHECK-LABEL: dupsext_v8i8_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -52,8 +52,8 @@ entry: define @dupsext_v2i8_v2i32(i8 %src, %b) { ; CHECK-LABEL: dupsext_v2i8_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -68,8 +68,8 @@ entry: define @dupsext_v4i8_v4i32(i8 %src, %b) { ; CHECK-LABEL: dupsext_v4i8_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -84,9 +84,9 @@ entry: define @dupsext_v2i8_v2i64(i8 %src, %b) { ; CHECK-LABEL: dupsext_v2i8_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxtb x8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -101,8 +101,8 @@ entry: define @dupsext_v2i16_v2i32(i16 %src, %b) { ; CHECK-LABEL: dupsext_v2i16_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -117,8 +117,8 @@ entry: define @dupsext_v4i16_v4i32(i16 %src, %b) { ; CHECK-LABEL: dupsext_v4i16_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -133,9 +133,9 @@ entry: define @dupsext_v2i16_v2i64(i16 %src, %b) { ; CHECK-LABEL: dupsext_v2i16_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxth x8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -150,9 +150,9 @@ entry: define @dupsext_v2i32_v2i64(i32 %src, %b) { ; CHECK-LABEL: dupsext_v2i32_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -167,8 +167,8 @@ entry: define @dupzext_v2i8_v2i16(i8 %src, %b) { ; CHECK-LABEL: dupzext_v2i8_v2i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -183,8 +183,8 @@ entry: define @dupzext_v4i8_v4i16(i8 %src, %b) { ; CHECK-LABEL: dupzext_v4i8_v4i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -199,8 +199,8 @@ entry: define @dupzext_v8i8_v8i16(i8 %src, %b) { ; CHECK-LABEL: dupzext_v8i8_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -215,8 +215,8 @@ entry: define @dupzext_v2i8_v2i32(i8 %src, %b) { ; CHECK-LABEL: dupzext_v2i8_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -231,8 +231,8 @@ entry: define @dupzext_v4i8_v4i32(i8 %src, %b) { ; CHECK-LABEL: dupzext_v4i8_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -247,9 +247,9 @@ entry: define @dupzext_v2i8_v2i64(i8 %src, %b) { ; CHECK-LABEL: dupzext_v2i8_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0xff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -264,8 +264,8 @@ entry: define @dupzext_v2i16_v2i32(i16 %src, %b) { ; CHECK-LABEL: dupzext_v2i16_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -280,8 +280,8 @@ entry: define @dupzext_v4i16_v4i32(i16 %src, %b) { ; CHECK-LABEL: dupzext_v4i16_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -296,9 +296,9 @@ entry: define @dupzext_v2i16_v2i64(i16 %src, %b) { ; CHECK-LABEL: dupzext_v2i16_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -313,8 +313,8 @@ entry: define @dupzext_v2i32_v2i64(i32 %src, %b) { ; CHECK-LABEL: dupzext_v2i32_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 61a4f64ac2bfc..540471a05901a 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -257,8 +257,8 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldrh w8, [x0] -; CHECK-SVE-NEXT: ptrue p0.d, vl2 ; CHECK-SVE-NEXT: ldrh w9, [x0, #2] +; CHECK-SVE-NEXT: ptrue p0.d, vl2 ; CHECK-SVE-NEXT: ldr d0, [x1] ; CHECK-SVE-NEXT: fmov d1, x8 ; CHECK-SVE-NEXT: sshll v0.2d, v0.2s, #0 diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll index a65c5d6667794..43122c8c953fc 100644 --- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll +++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll @@ -131,9 +131,9 @@ define @lane_mask_nxv2i1_i8(i8 %index, i8 %TC) { ; CHECK-NEXT: index z0.d, #0, #1 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0xff +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: and x8, x1, #0xff ; CHECK-NEXT: and z0.d, z0.d, #0xff ; CHECK-NEXT: add z0.d, z0.d, z1.d @@ -153,6 +153,7 @@ define @lane_mask_nxv32i1_i32(i32 %index, i32 %TC) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill @@ -160,16 +161,16 @@ define @lane_mask_nxv32i1_i32(i32 %index, i32 %TC) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: index z0.s, #0, #1 ; CHECK-NEXT: mov z1.s, w0 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z25.s, w1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.d, z0.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: uqadd z6.s, z0.s, z1.s ; CHECK-NEXT: incw z0.s, all, mul #4 ; CHECK-NEXT: incw z2.s ; CHECK-NEXT: incw z3.s, all, mul #2 -; CHECK-NEXT: cmphi p2.s, p0/z, z25.s, z6.s ; CHECK-NEXT: uqadd z0.s, z0.s, z1.s +; CHECK-NEXT: cmphi p2.s, p0/z, z25.s, z6.s ; CHECK-NEXT: mov z4.d, z2.d ; CHECK-NEXT: uqadd z5.s, z2.s, z1.s ; CHECK-NEXT: uqadd z7.s, z3.s, z1.s @@ -177,25 +178,26 @@ define @lane_mask_nxv32i1_i32(i32 %index, i32 %TC) { ; CHECK-NEXT: incw z3.s, all, mul #4 ; CHECK-NEXT: cmphi p5.s, p0/z, z25.s, z0.s ; CHECK-NEXT: incw z4.s, all, mul #2 -; CHECK-NEXT: cmphi p1.s, p0/z, z25.s, z5.s -; CHECK-NEXT: cmphi p3.s, p0/z, z25.s, z7.s ; CHECK-NEXT: uqadd z2.s, z2.s, z1.s ; CHECK-NEXT: uqadd z3.s, z3.s, z1.s +; CHECK-NEXT: cmphi p1.s, p0/z, z25.s, z5.s +; CHECK-NEXT: cmphi p3.s, p0/z, z25.s, z7.s ; CHECK-NEXT: uqadd z24.s, z4.s, z1.s ; CHECK-NEXT: incw z4.s, all, mul #4 -; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h ; CHECK-NEXT: cmphi p6.s, p0/z, z25.s, z2.s -; CHECK-NEXT: cmphi p2.s, p0/z, z25.s, z3.s +; CHECK-NEXT: cmphi p7.s, p0/z, z25.s, z3.s +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h ; CHECK-NEXT: uqadd z1.s, z4.s, z1.s ; CHECK-NEXT: cmphi p4.s, p0/z, z25.s, z24.s -; CHECK-NEXT: uzp1 p3.h, p3.h, p4.h ; CHECK-NEXT: cmphi p0.s, p0/z, z25.s, z1.s -; CHECK-NEXT: uzp1 p4.h, p5.h, p6.h +; CHECK-NEXT: uzp1 p2.h, p3.h, p4.h +; CHECK-NEXT: uzp1 p3.h, p5.h, p6.h ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p2.h, p2.h, p0.h -; CHECK-NEXT: uzp1 p0.b, p1.b, p3.b -; CHECK-NEXT: uzp1 p1.b, p4.b, p2.b +; CHECK-NEXT: uzp1 p4.h, p7.h, p0.h +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.b, p1.b, p2.b +; CHECK-NEXT: uzp1 p1.b, p3.b, p4.b ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -208,96 +210,97 @@ define @lane_mask_nxv32i1_i64(i64 %index, i64 %TC) { ; CHECK-LABEL: lane_mask_nxv32i1_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: index z5.d, #0, #1 ; CHECK-NEXT: mov z0.d, x0 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z3.d, x1 -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: mov z6.d, z1.d -; CHECK-NEXT: uqadd z25.d, z1.d, z0.d -; CHECK-NEXT: incd z1.d, all, mul #8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z2.d, z5.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: mov z4.d, z5.d +; CHECK-NEXT: uqadd z25.d, z5.d, z0.d +; CHECK-NEXT: incd z5.d, all, mul #8 ; CHECK-NEXT: incd z2.d -; CHECK-NEXT: incd z4.d, all, mul #2 -; CHECK-NEXT: incd z6.d, all, mul #4 -; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z25.d -; CHECK-NEXT: uqadd z1.d, z1.d, z0.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: uqadd z26.d, z2.d, z0.d +; CHECK-NEXT: incd z1.d, all, mul #2 +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: uqadd z5.d, z5.d, z0.d +; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z25.d +; CHECK-NEXT: mov z6.d, z2.d ; CHECK-NEXT: mov z7.d, z2.d -; CHECK-NEXT: mov z24.d, z4.d -; CHECK-NEXT: uqadd z27.d, z4.d, z0.d -; CHECK-NEXT: uqadd z28.d, z6.d, z0.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: uqadd z26.d, z2.d, z0.d +; CHECK-NEXT: uqadd z27.d, z1.d, z0.d +; CHECK-NEXT: uqadd z28.d, z4.d, z0.d ; CHECK-NEXT: incd z2.d, all, mul #8 +; CHECK-NEXT: incd z1.d, all, mul #8 ; CHECK-NEXT: incd z4.d, all, mul #8 -; CHECK-NEXT: incd z6.d, all, mul #8 -; CHECK-NEXT: incd z5.d, all, mul #2 +; CHECK-NEXT: incd z6.d, all, mul #2 ; CHECK-NEXT: incd z7.d, all, mul #4 -; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z26.d ; CHECK-NEXT: incd z24.d, all, mul #4 -; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z27.d -; CHECK-NEXT: cmphi p5.d, p0/z, z3.d, z28.d +; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z26.d +; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z27.d +; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z28.d +; CHECK-NEXT: mov z31.d, z6.d +; CHECK-NEXT: uqadd z29.d, z6.d, z0.d +; CHECK-NEXT: uqadd z30.d, z7.d, z0.d +; CHECK-NEXT: uqadd z8.d, z24.d, z0.d +; CHECK-NEXT: incd z6.d, all, mul #8 +; CHECK-NEXT: incd z7.d, all, mul #8 +; CHECK-NEXT: incd z24.d, all, mul #8 ; CHECK-NEXT: uqadd z2.d, z2.d, z0.d +; CHECK-NEXT: uqadd z1.d, z1.d, z0.d +; CHECK-NEXT: incd z31.d, all, mul #4 ; CHECK-NEXT: uqadd z4.d, z4.d, z0.d +; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s +; CHECK-NEXT: cmphi p5.d, p0/z, z3.d, z29.d +; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z30.d ; CHECK-NEXT: uqadd z6.d, z6.d, z0.d -; CHECK-NEXT: mov z26.d, z5.d -; CHECK-NEXT: uqadd z25.d, z5.d, z0.d -; CHECK-NEXT: uqadd z27.d, z7.d, z0.d -; CHECK-NEXT: incd z5.d, all, mul #8 -; CHECK-NEXT: incd z7.d, all, mul #8 -; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s -; CHECK-NEXT: incd z26.d, all, mul #4 -; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z2.d -; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z25.d -; CHECK-NEXT: uqadd z25.d, z24.d, z0.d -; CHECK-NEXT: incd z24.d, all, mul #8 -; CHECK-NEXT: uqadd z5.d, z5.d, z0.d +; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z8.d +; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: uqadd z7.d, z7.d, z0.d -; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z27.d -; CHECK-NEXT: uqadd z28.d, z26.d, z0.d -; CHECK-NEXT: incd z26.d, all, mul #8 -; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s +; CHECK-NEXT: uqadd z25.d, z31.d, z0.d +; CHECK-NEXT: incd z31.d, all, mul #8 ; CHECK-NEXT: uqadd z24.d, z24.d, z0.d -; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z25.d -; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z1.d -; CHECK-NEXT: uzp1 p5.s, p5.s, p6.s -; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z4.d -; CHECK-NEXT: cmphi p9.d, p0/z, z3.d, z5.d -; CHECK-NEXT: cmphi p10.d, p0/z, z3.d, z7.d -; CHECK-NEXT: uqadd z0.d, z26.d, z0.d -; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z28.d -; CHECK-NEXT: uzp1 p4.s, p4.s, p8.s -; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z24.d -; CHECK-NEXT: uzp1 p6.s, p6.s, p9.s +; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z5.d +; CHECK-NEXT: uzp1 p2.s, p2.s, p5.s +; CHECK-NEXT: cmphi p5.d, p0/z, z3.d, z2.d +; CHECK-NEXT: cmphi p9.d, p0/z, z3.d, z6.d +; CHECK-NEXT: uqadd z0.d, z31.d, z0.d +; CHECK-NEXT: uzp1 p1.s, p1.s, p7.s +; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z1.d +; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z25.d +; CHECK-NEXT: uzp1 p2.h, p3.h, p2.h +; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z7.d +; CHECK-NEXT: uzp1 p4.s, p4.s, p5.s +; CHECK-NEXT: uzp1 p5.s, p7.s, p9.s ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h -; CHECK-NEXT: uzp1 p2.s, p7.s, p2.s -; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z6.d +; CHECK-NEXT: uzp1 p6.s, p6.s, p8.s +; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z4.d +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p4.h, p4.h, p5.h +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.h, p1.h, p6.h +; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z24.d ; CHECK-NEXT: cmphi p0.d, p0/z, z3.d, z0.d -; CHECK-NEXT: uzp1 p7.s, p7.s, p10.s -; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p0.s, p8.s, p0.s +; CHECK-NEXT: uzp1 p3.s, p8.s, p3.s ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p3.h, p4.h, p6.h +; CHECK-NEXT: uzp1 p0.s, p6.s, p0.s ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p2.h, p5.h, p2.h -; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p4.h, p7.h, p0.h -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p0.b, p1.b, p2.b -; CHECK-NEXT: uzp1 p1.b, p3.b, p4.b +; CHECK-NEXT: uzp1 p3.h, p3.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p2.b, p1.b +; CHECK-NEXT: uzp1 p1.b, p4.b, p3.b ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %active.lane.mask = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 %index, i64 %TC) @@ -459,12 +462,12 @@ define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) { ; CHECK-NEXT: adrp x8, .LCPI26_0 ; CHECK-NEXT: movi d2, #0xff00ff00ff00ff ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI26_0] +; CHECK-NEXT: dup v3.4h, w1 ; CHECK-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-NEXT: bic v3.4h, #255, lsl #8 ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: dup v1.4h, w1 ; CHECK-NEXT: umin v0.4h, v0.4h, v2.4h -; CHECK-NEXT: bic v1.4h, #255, lsl #8 -; CHECK-NEXT: cmhi v0.4h, v1.4h, v0.4h +; CHECK-NEXT: cmhi v0.4h, v3.4h, v0.4h ; CHECK-NEXT: ret %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i8(i8 %index, i8 %TC) ret <4 x i1> %active.lane.mask @@ -480,9 +483,9 @@ define <2 x i1> @lane_mask_v2i1_i8(i8 %index, i8 %TC) { ; CHECK-NEXT: dup v3.2s, w1 ; CHECK-NEXT: and v1.8b, v1.8b, v0.8b ; CHECK-NEXT: add v1.2s, v1.2s, v2.2s -; CHECK-NEXT: umin v1.2s, v1.2s, v0.2s -; CHECK-NEXT: and v0.8b, v3.8b, v0.8b -; CHECK-NEXT: cmhi v0.2s, v0.2s, v1.2s +; CHECK-NEXT: and v2.8b, v3.8b, v0.8b +; CHECK-NEXT: umin v0.2s, v1.2s, v0.2s +; CHECK-NEXT: cmhi v0.2s, v2.2s, v0.2s ; CHECK-NEXT: ret %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i8(i8 %index, i8 %TC) ret <2 x i1> %active.lane.mask diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll index 3007e7ce771e6..508f68d6f14d4 100644 --- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll +++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll @@ -54,19 +54,19 @@ define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) { ; CHECK-LABEL: uitofp_v4i64_to_v4bf16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NEXT: ucvtf v1.2d, v1.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret %tmp1 = load <4 x i64>, ptr %ptr diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll index fe4da2e7cf36b..5b45ba2552cef 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -257,12 +257,12 @@ define i16 @uabd16b_rdx(ptr %a, ptr %b) { ; CHECK-GI-NEXT: movi.2d v0, #0000000000000000 ; CHECK-GI-NEXT: usubl.8h v3, v1, v2 ; CHECK-GI-NEXT: usubl2.8h v1, v1, v2 -; CHECK-GI-NEXT: neg.8h v2, v3 -; CHECK-GI-NEXT: neg.8h v4, v1 -; CHECK-GI-NEXT: cmgt.8h v5, v0, v3 +; CHECK-GI-NEXT: cmgt.8h v2, v0, v3 ; CHECK-GI-NEXT: cmgt.8h v0, v0, v1 -; CHECK-GI-NEXT: bif.16b v2, v3, v5 -; CHECK-GI-NEXT: bsl.16b v0, v4, v1 +; CHECK-GI-NEXT: neg.8h v4, v3 +; CHECK-GI-NEXT: neg.8h v5, v1 +; CHECK-GI-NEXT: bsl.16b v2, v4, v3 +; CHECK-GI-NEXT: bsl.16b v0, v5, v1 ; CHECK-GI-NEXT: add.8h v0, v2, v0 ; CHECK-GI-NEXT: addv.8h h0, v0 ; CHECK-GI-NEXT: fmov w0, s0 @@ -299,18 +299,18 @@ define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) { ; CHECK-GI-NEXT: usubl2.4s v3, v3, v4 ; CHECK-GI-NEXT: usubl.4s v4, v0, v1 ; CHECK-GI-NEXT: usubl2.4s v0, v0, v1 -; CHECK-GI-NEXT: neg.4s v6, v5 -; CHECK-GI-NEXT: neg.4s v7, v3 ; CHECK-GI-NEXT: cmgt.4s v1, v2, v5 -; CHECK-GI-NEXT: neg.4s v16, v4 -; CHECK-GI-NEXT: neg.4s v17, v0 -; CHECK-GI-NEXT: cmgt.4s v18, v2, v3 -; CHECK-GI-NEXT: cmgt.4s v19, v2, v4 +; CHECK-GI-NEXT: cmgt.4s v6, v2, v3 +; CHECK-GI-NEXT: neg.4s v16, v5 +; CHECK-GI-NEXT: cmgt.4s v7, v2, v4 ; CHECK-GI-NEXT: cmgt.4s v2, v2, v0 -; CHECK-GI-NEXT: bsl.16b v1, v6, v5 -; CHECK-GI-NEXT: bit.16b v3, v7, v18 -; CHECK-GI-NEXT: bit.16b v4, v16, v19 -; CHECK-GI-NEXT: bit.16b v0, v17, v2 +; CHECK-GI-NEXT: neg.4s v17, v3 +; CHECK-GI-NEXT: neg.4s v18, v4 +; CHECK-GI-NEXT: neg.4s v19, v0 +; CHECK-GI-NEXT: bsl.16b v1, v16, v5 +; CHECK-GI-NEXT: bit.16b v3, v17, v6 +; CHECK-GI-NEXT: bit.16b v4, v18, v7 +; CHECK-GI-NEXT: bit.16b v0, v19, v2 ; CHECK-GI-NEXT: add.4s v1, v1, v3 ; CHECK-GI-NEXT: add.4s v0, v4, v0 ; CHECK-GI-NEXT: add.4s v0, v1, v0 @@ -347,18 +347,18 @@ define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) { ; CHECK-GI-NEXT: ssubl2.4s v3, v3, v4 ; CHECK-GI-NEXT: ssubl.4s v4, v0, v1 ; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1 -; CHECK-GI-NEXT: neg.4s v6, v5 -; CHECK-GI-NEXT: neg.4s v7, v3 ; CHECK-GI-NEXT: cmgt.4s v1, v2, v5 -; CHECK-GI-NEXT: neg.4s v16, v4 -; CHECK-GI-NEXT: neg.4s v17, v0 -; CHECK-GI-NEXT: cmgt.4s v18, v2, v3 -; CHECK-GI-NEXT: cmgt.4s v19, v2, v4 +; CHECK-GI-NEXT: cmgt.4s v6, v2, v3 +; CHECK-GI-NEXT: neg.4s v16, v5 +; CHECK-GI-NEXT: cmgt.4s v7, v2, v4 ; CHECK-GI-NEXT: cmgt.4s v2, v2, v0 -; CHECK-GI-NEXT: bsl.16b v1, v6, v5 -; CHECK-GI-NEXT: bit.16b v3, v7, v18 -; CHECK-GI-NEXT: bit.16b v4, v16, v19 -; CHECK-GI-NEXT: bit.16b v0, v17, v2 +; CHECK-GI-NEXT: neg.4s v17, v3 +; CHECK-GI-NEXT: neg.4s v18, v4 +; CHECK-GI-NEXT: neg.4s v19, v0 +; CHECK-GI-NEXT: bsl.16b v1, v16, v5 +; CHECK-GI-NEXT: bit.16b v3, v17, v6 +; CHECK-GI-NEXT: bit.16b v4, v18, v7 +; CHECK-GI-NEXT: bit.16b v0, v19, v2 ; CHECK-GI-NEXT: add.4s v1, v1, v3 ; CHECK-GI-NEXT: add.4s v0, v4, v0 ; CHECK-GI-NEXT: add.4s v0, v1, v0 @@ -396,12 +396,12 @@ define i32 @uabd8h_rdx(ptr %a, ptr %b) { ; CHECK-GI-NEXT: movi.2d v0, #0000000000000000 ; CHECK-GI-NEXT: usubl.4s v3, v1, v2 ; CHECK-GI-NEXT: usubl2.4s v1, v1, v2 -; CHECK-GI-NEXT: neg.4s v2, v3 -; CHECK-GI-NEXT: neg.4s v4, v1 -; CHECK-GI-NEXT: cmgt.4s v5, v0, v3 +; CHECK-GI-NEXT: cmgt.4s v2, v0, v3 ; CHECK-GI-NEXT: cmgt.4s v0, v0, v1 -; CHECK-GI-NEXT: bif.16b v2, v3, v5 -; CHECK-GI-NEXT: bsl.16b v0, v4, v1 +; CHECK-GI-NEXT: neg.4s v4, v3 +; CHECK-GI-NEXT: neg.4s v5, v1 +; CHECK-GI-NEXT: bsl.16b v2, v4, v3 +; CHECK-GI-NEXT: bsl.16b v0, v5, v1 ; CHECK-GI-NEXT: add.4s v0, v2, v0 ; CHECK-GI-NEXT: addv.4s s0, v0 ; CHECK-GI-NEXT: fmov w0, s0 @@ -428,15 +428,15 @@ define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) { ; ; CHECK-GI-LABEL: sabd8h_rdx: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: ssubl.4s v3, v0, v1 ; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 -; CHECK-GI-NEXT: neg.4s v1, v3 -; CHECK-GI-NEXT: neg.4s v4, v0 -; CHECK-GI-NEXT: cmgt.4s v5, v2, v3 +; CHECK-GI-NEXT: neg.4s v4, v3 +; CHECK-GI-NEXT: neg.4s v5, v0 +; CHECK-GI-NEXT: cmgt.4s v1, v2, v3 ; CHECK-GI-NEXT: cmgt.4s v2, v2, v0 -; CHECK-GI-NEXT: bif.16b v1, v3, v5 -; CHECK-GI-NEXT: bit.16b v0, v4, v2 +; CHECK-GI-NEXT: bsl.16b v1, v4, v3 +; CHECK-GI-NEXT: bit.16b v0, v5, v2 ; CHECK-GI-NEXT: add.4s v0, v1, v0 ; CHECK-GI-NEXT: addv.4s s0, v0 ; CHECK-GI-NEXT: fmov w0, s0 @@ -461,10 +461,10 @@ define i32 @uabdl4s_rdx_i32(<4 x i16> %a, <4 x i16> %b) { ; ; CHECK-GI-LABEL: uabdl4s_rdx_i32: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: usubl.4s v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v1, #0000000000000000 +; CHECK-GI-NEXT: cmgt.4s v1, v2, v0 ; CHECK-GI-NEXT: neg.4s v2, v0 -; CHECK-GI-NEXT: cmgt.4s v1, v1, v0 ; CHECK-GI-NEXT: bit.16b v0, v2, v1 ; CHECK-GI-NEXT: addv.4s s0, v0 ; CHECK-GI-NEXT: fmov w0, s0 @@ -499,12 +499,12 @@ define i64 @uabd4s_rdx(ptr %a, ptr %b, i32 %h) { ; CHECK-GI-NEXT: movi.2d v0, #0000000000000000 ; CHECK-GI-NEXT: usubl.2d v3, v1, v2 ; CHECK-GI-NEXT: usubl2.2d v1, v1, v2 -; CHECK-GI-NEXT: neg.2d v2, v3 -; CHECK-GI-NEXT: neg.2d v4, v1 -; CHECK-GI-NEXT: cmgt.2d v5, v0, v3 +; CHECK-GI-NEXT: cmgt.2d v2, v0, v3 ; CHECK-GI-NEXT: cmgt.2d v0, v0, v1 -; CHECK-GI-NEXT: bif.16b v2, v3, v5 -; CHECK-GI-NEXT: bsl.16b v0, v4, v1 +; CHECK-GI-NEXT: neg.2d v4, v3 +; CHECK-GI-NEXT: neg.2d v5, v1 +; CHECK-GI-NEXT: bsl.16b v2, v4, v3 +; CHECK-GI-NEXT: bsl.16b v0, v5, v1 ; CHECK-GI-NEXT: add.2d v0, v2, v0 ; CHECK-GI-NEXT: addp.2d d0, v0 ; CHECK-GI-NEXT: fmov x0, d0 @@ -531,15 +531,15 @@ define i64 @sabd4s_rdx(<4 x i32> %a, <4 x i32> %b) { ; ; CHECK-GI-LABEL: sabd4s_rdx: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: ssubl.2d v3, v0, v1 ; CHECK-GI-NEXT: ssubl2.2d v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 -; CHECK-GI-NEXT: neg.2d v1, v3 -; CHECK-GI-NEXT: neg.2d v4, v0 -; CHECK-GI-NEXT: cmgt.2d v5, v2, v3 +; CHECK-GI-NEXT: neg.2d v4, v3 +; CHECK-GI-NEXT: neg.2d v5, v0 +; CHECK-GI-NEXT: cmgt.2d v1, v2, v3 ; CHECK-GI-NEXT: cmgt.2d v2, v2, v0 -; CHECK-GI-NEXT: bif.16b v1, v3, v5 -; CHECK-GI-NEXT: bit.16b v0, v4, v2 +; CHECK-GI-NEXT: bsl.16b v1, v4, v3 +; CHECK-GI-NEXT: bit.16b v0, v5, v2 ; CHECK-GI-NEXT: add.2d v0, v1, v0 ; CHECK-GI-NEXT: addp.2d d0, v0 ; CHECK-GI-NEXT: fmov x0, d0 @@ -564,10 +564,10 @@ define i64 @uabdl2d_rdx_i64(<2 x i32> %a, <2 x i32> %b) { ; ; CHECK-GI-LABEL: uabdl2d_rdx_i64: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: usubl.2d v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v1, #0000000000000000 +; CHECK-GI-NEXT: cmgt.2d v1, v2, v0 ; CHECK-GI-NEXT: neg.2d v2, v0 -; CHECK-GI-NEXT: cmgt.2d v1, v1, v0 ; CHECK-GI-NEXT: bit.16b v0, v2, v1 ; CHECK-GI-NEXT: addp.2d d0, v0 ; CHECK-GI-NEXT: fmov x0, d0 @@ -1796,10 +1796,10 @@ define <2 x i64> @uabd_i32(<2 x i32> %a, <2 x i32> %b) { ; ; CHECK-GI-LABEL: uabd_i32: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: ssubl.2d v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v1, #0000000000000000 +; CHECK-GI-NEXT: cmgt.2d v1, v2, v0 ; CHECK-GI-NEXT: neg.2d v2, v0 -; CHECK-GI-NEXT: cmgt.2d v1, v1, v0 ; CHECK-GI-NEXT: bit.16b v0, v2, v1 ; CHECK-GI-NEXT: ret %aext = sext <2 x i32> %a to <2 x i64> diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll index cafee32ada686..d4cc154ac6afc 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll @@ -205,15 +205,15 @@ define <2 x bfloat> @test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp { ; GENERIC-LABEL: test_vcvt_bf16_f64: ; GENERIC: // %bb.0: ; GENERIC-NEXT: fcvtxn v0.2s, v0.2d -; GENERIC-NEXT: movi.4s v1, #127, msl #8 -; GENERIC-NEXT: movi.4s v2, #1 +; GENERIC-NEXT: movi.4s v1, #1 +; GENERIC-NEXT: movi.4s v2, #127, msl #8 ; GENERIC-NEXT: ushr.4s v3, v0, #16 -; GENERIC-NEXT: add.4s v1, v0, v1 -; GENERIC-NEXT: and.16b v2, v3, v2 -; GENERIC-NEXT: add.4s v1, v2, v1 -; GENERIC-NEXT: fcmeq.4s v2, v0, v0 +; GENERIC-NEXT: add.4s v2, v0, v2 +; GENERIC-NEXT: and.16b v1, v3, v1 +; GENERIC-NEXT: fcmeq.4s v3, v0, v0 ; GENERIC-NEXT: orr.4s v0, #64, lsl #16 -; GENERIC-NEXT: bit.16b v0, v1, v2 +; GENERIC-NEXT: add.4s v1, v1, v2 +; GENERIC-NEXT: bit.16b v0, v1, v3 ; GENERIC-NEXT: shrn.4h v0, v0, #16 ; GENERIC-NEXT: ret ; @@ -238,15 +238,15 @@ define <2 x bfloat> @test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp { ; GISEL-LABEL: test_vcvt_bf16_f64: ; GISEL: // %bb.0: ; GISEL-NEXT: fcvtxn v0.2s, v0.2d -; GISEL-NEXT: movi.4s v1, #127, msl #8 -; GISEL-NEXT: movi.4s v2, #1 +; GISEL-NEXT: movi.4s v1, #1 +; GISEL-NEXT: movi.4s v2, #127, msl #8 ; GISEL-NEXT: ushr.4s v3, v0, #16 -; GISEL-NEXT: add.4s v1, v0, v1 -; GISEL-NEXT: and.16b v2, v3, v2 -; GISEL-NEXT: add.4s v1, v2, v1 -; GISEL-NEXT: fcmeq.4s v2, v0, v0 +; GISEL-NEXT: add.4s v2, v0, v2 +; GISEL-NEXT: and.16b v1, v3, v1 +; GISEL-NEXT: fcmeq.4s v3, v0, v0 ; GISEL-NEXT: orr.4s v0, #64, lsl #16 -; GISEL-NEXT: bit.16b v0, v1, v2 +; GISEL-NEXT: add.4s v1, v1, v2 +; GISEL-NEXT: bit.16b v0, v1, v3 ; GISEL-NEXT: shrn.4h v0, v0, #16 ; GISEL-NEXT: ret %vcvt1.i = fptrunc <2 x double> %v to <2 x bfloat> diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll index dda610e5dd3cb..e754f01daa2a9 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll @@ -903,10 +903,10 @@ define <2 x i16> @hadd8x2_sext_lsr(<2 x i8> %src1, <2 x i8> %src2) { ; CHECK: // %bb.0: ; CHECK-NEXT: shl.2s v0, v0, #24 ; CHECK-NEXT: shl.2s v1, v1, #24 +; CHECK-NEXT: movi d2, #0x00ffff0000ffff ; CHECK-NEXT: sshr.2s v0, v0, #24 ; CHECK-NEXT: ssra.2s v0, v1, #24 -; CHECK-NEXT: movi d1, #0x00ffff0000ffff -; CHECK-NEXT: and.8b v0, v0, v1 +; CHECK-NEXT: and.8b v0, v0, v2 ; CHECK-NEXT: ushr.2s v0, v0, #1 ; CHECK-NEXT: ret %zextsrc1 = sext <2 x i8> %src1 to <2 x i16> @@ -968,10 +968,10 @@ define <4 x i16> @rhadd8_sext_lsr(<4 x i8> %src1, <4 x i8> %src2) { ; CHECK: // %bb.0: ; CHECK-NEXT: shl.4h v0, v0, #8 ; CHECK-NEXT: shl.4h v1, v1, #8 +; CHECK-NEXT: movi.4h v2, #1 ; CHECK-NEXT: sshr.4h v0, v0, #8 ; CHECK-NEXT: ssra.4h v0, v1, #8 -; CHECK-NEXT: movi.4h v1, #1 -; CHECK-NEXT: add.4h v0, v0, v1 +; CHECK-NEXT: add.4h v0, v0, v2 ; CHECK-NEXT: ushr.4h v0, v0, #1 ; CHECK-NEXT: ret %zextsrc1 = sext <4 x i8> %src1 to <4 x i16> diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll index ebf5ce20d4ecc..86b1d5d195ffd 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll @@ -7,21 +7,22 @@ target triple = "aarch64-unknown-linux-gnu" define @mull_add( %a, %b, %c) { ; CHECK-LABEL: mull_add: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 z6.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z7.d, z2.d, z3.d -; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z6.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z2.d, z3.d +; CHECK-NEXT: uzp1 z2.d, z2.d, z3.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d -; CHECK-NEXT: fmul z2.d, z6.d, z7.d -; CHECK-NEXT: fmul z3.d, z0.d, z7.d -; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d -; CHECK-NEXT: fnmsb z1.d, p0/m, z6.d, z3.d -; CHECK-NEXT: uzp2 z2.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z3.d, z4.d, z5.d -; CHECK-NEXT: fadd z2.d, z0.d, z2.d +; CHECK-NEXT: fmul z7.d, z0.d, z1.d +; CHECK-NEXT: fmul z1.d, z6.d, z1.d +; CHECK-NEXT: movprfx z3, z7 +; CHECK-NEXT: fmla z3.d, p0/m, z6.d, z2.d +; CHECK-NEXT: fnmsb z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z4.d, z5.d +; CHECK-NEXT: uzp1 z2.d, z4.d, z5.d +; CHECK-NEXT: fadd z2.d, z2.d, z0.d ; CHECK-NEXT: fadd z1.d, z3.d, z1.d -; CHECK-NEXT: zip1 z0.d, z1.d, z2.d -; CHECK-NEXT: zip2 z1.d, z1.d, z2.d +; CHECK-NEXT: zip1 z0.d, z2.d, z1.d +; CHECK-NEXT: zip2 z1.d, z2.d, z1.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -49,21 +50,21 @@ entry: define @mul_add_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_add_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #0 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #90 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 -; CHECK-NEXT: fadd z0.d, z25.d, z27.d ; CHECK-NEXT: fadd z1.d, z26.d, z24.d +; CHECK-NEXT: fadd z0.d, z25.d, z27.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -100,21 +101,21 @@ entry: define @mul_sub_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_sub_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #0 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #90 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 -; CHECK-NEXT: fsub z0.d, z25.d, z27.d ; CHECK-NEXT: fsub z1.d, z26.d, z24.d +; CHECK-NEXT: fsub z0.d, z25.d, z27.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -151,21 +152,21 @@ entry: define @mul_conj_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_conj_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0 ; CHECK-NEXT: fcmla z27.d, p0/m, z4.d, z6.d, #0 -; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90 ; CHECK-NEXT: fcmla z27.d, p0/m, z4.d, z6.d, #270 -; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270 -; CHECK-NEXT: fadd z0.d, z25.d, z27.d ; CHECK-NEXT: fadd z1.d, z26.d, z24.d +; CHECK-NEXT: fadd z0.d, z25.d, z27.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -206,8 +207,8 @@ define @mul_add_rot_mull( %a, @mul_add_rot_mull( %a, @mul_add_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_add_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d -; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #90 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 ; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -90,19 +90,19 @@ entry: define @mul_sub_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_sub_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d -; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #270 ; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #270 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #270 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #180 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #180 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #180 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 ; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -139,19 +139,19 @@ entry: define @mul_conj_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_conj_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 -; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 ; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #270 +; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270 -; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #270 ; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -188,24 +188,25 @@ entry: define @mul_add_rot_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_add_rot_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 z24.d, z2.d, z3.d +; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d ; CHECK-NEXT: uzp2 z25.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d -; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uzp1 z2.d, z2.d, z3.d ; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z4.d, z5.d ; CHECK-NEXT: uzp1 z26.d, z6.d, z7.d -; CHECK-NEXT: fmul z1.d, z24.d, z25.d -; CHECK-NEXT: fmul z3.d, z2.d, z25.d -; CHECK-NEXT: uzp2 z25.d, z4.d, z5.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d ; CHECK-NEXT: uzp2 z5.d, z6.d, z7.d -; CHECK-NEXT: fmla z3.d, p0/m, z26.d, z25.d -; CHECK-NEXT: fmla z1.d, p0/m, z2.d, z0.d -; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: fmla z2.d, p0/m, z5.d, z4.d -; CHECK-NEXT: fmla z1.d, p0/m, z26.d, z4.d -; CHECK-NEXT: fnmls z2.d, p0/m, z24.d, z0.d -; CHECK-NEXT: fmls z1.d, p0/m, z5.d, z25.d +; CHECK-NEXT: fmul z3.d, z2.d, z25.d +; CHECK-NEXT: fmul z25.d, z24.d, z25.d +; CHECK-NEXT: fmla z3.d, p0/m, z24.d, z0.d +; CHECK-NEXT: movprfx z24, z25 +; CHECK-NEXT: fmla z24.d, p0/m, z26.d, z1.d +; CHECK-NEXT: movprfx z6, z24 +; CHECK-NEXT: fmla z6.d, p0/m, z5.d, z4.d +; CHECK-NEXT: fmla z3.d, p0/m, z26.d, z4.d +; CHECK-NEXT: fnmsb z2.d, p0/m, z0.d, z6.d +; CHECK-NEXT: fmsb z1.d, p0/m, z5.d, z3.d ; CHECK-NEXT: zip1 z0.d, z2.d, z1.d ; CHECK-NEXT: zip2 z1.d, z2.d, z1.d ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll index 611cf44ea7ee8..cb285c05b2e86 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll @@ -16,9 +16,10 @@ define @complex_mul_v4f16( %a, @complex_mul_v8f16( %a, %b) { ; CHECK-LABEL: complex_mul_v8f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: fcmla z2.h, p0/m, z1.h, z0.h, #0 ; CHECK-NEXT: fcmla z2.h, p0/m, z1.h, z0.h, #90 ; CHECK-NEXT: mov z0.d, z2.d @@ -72,15 +73,15 @@ entry: define @complex_mul_v16f16( %a, %b) { ; CHECK-LABEL: complex_mul_v16f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z4.h, #0 // =0x0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z5.d, z4.d -; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #0 ; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #0 -; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #90 +; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #0 ; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #90 -; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #90 ; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16f16( %a) @@ -103,23 +104,23 @@ entry: define @complex_mul_v32f16( %a, %b) { ; CHECK-LABEL: complex_mul_v32f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z24.h, #0 // =0x0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #0 ; CHECK-NEXT: fcmla z25.h, p0/m, z4.h, z0.h, #0 ; CHECK-NEXT: fcmla z26.h, p0/m, z5.h, z1.h, #0 ; CHECK-NEXT: fcmla z27.h, p0/m, z6.h, z2.h, #0 -; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #0 +; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #90 ; CHECK-NEXT: fcmla z25.h, p0/m, z4.h, z0.h, #90 ; CHECK-NEXT: fcmla z26.h, p0/m, z5.h, z1.h, #90 ; CHECK-NEXT: fcmla z27.h, p0/m, z6.h, z2.h, #90 -; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #90 +; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: mov z1.d, z26.d ; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32f16( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll index 0f5e9a2202ddd..1e2afb78de1b0 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll @@ -7,8 +7,8 @@ target triple = "aarch64" define @complex_mul_v4f32( %a, %b) { ; CHECK-LABEL: complex_mul_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fcmla z2.s, p0/m, z1.s, z0.s, #0 ; CHECK-NEXT: fcmla z2.s, p0/m, z1.s, z0.s, #90 ; CHECK-NEXT: mov z0.d, z2.d @@ -34,15 +34,15 @@ entry: define @complex_mul_v8f32( %a, %b) { ; CHECK-LABEL: complex_mul_v8f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z4.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z5.d, z4.d -; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #0 ; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #0 -; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #90 +; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #0 ; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #90 -; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #90 ; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( %a) @@ -65,23 +65,23 @@ entry: define @complex_mul_v16f32( %a, %b) { ; CHECK-LABEL: complex_mul_v16f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z24.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #0 ; CHECK-NEXT: fcmla z25.s, p0/m, z4.s, z0.s, #0 ; CHECK-NEXT: fcmla z26.s, p0/m, z5.s, z1.s, #0 ; CHECK-NEXT: fcmla z27.s, p0/m, z6.s, z2.s, #0 -; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #0 +; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #90 ; CHECK-NEXT: fcmla z25.s, p0/m, z4.s, z0.s, #90 ; CHECK-NEXT: fcmla z26.s, p0/m, z5.s, z1.s, #90 ; CHECK-NEXT: fcmla z27.s, p0/m, z6.s, z2.s, #90 -; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #90 +; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: mov z1.d, z26.d ; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16f32( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll index 1fe554bdc616e..17a239a09a033 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll @@ -7,8 +7,8 @@ target triple = "aarch64" define @complex_mul_v2f64( %a, %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fcmla z2.d, p0/m, z1.d, z0.d, #0 ; CHECK-NEXT: fcmla z2.d, p0/m, z1.d, z0.d, #90 ; CHECK-NEXT: mov z0.d, z2.d @@ -34,15 +34,15 @@ entry: define @complex_mul_v4f64( %a, %b) { ; CHECK-LABEL: complex_mul_v4f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z5.d, z4.d -; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z4.d, p0/m, z3.d, z1.d, #0 -; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #90 +; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z4.d, p0/m, z3.d, z1.d, #90 -; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #90 ; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -65,23 +65,23 @@ entry: define @complex_mul_v8f64( %a, %b) { ; CHECK-LABEL: complex_mul_v8f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z0.d, #0 ; CHECK-NEXT: fcmla z26.d, p0/m, z5.d, z1.d, #0 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z2.d, #0 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #90 ; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z0.d, #90 ; CHECK-NEXT: fcmla z26.d, p0/m, z5.d, z1.d, #90 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z2.d, #90 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #90 +; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: mov z1.d, z26.d ; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f64( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll index 1b8a21b66ade9..07488b623b98d 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll @@ -16,8 +16,9 @@ define @complex_mul_v4i16( %a, This Inner Loop Header: Depth=1 -; CHECK-NEXT: zip2 p3.d, p1.d, p1.d +; CHECK-NEXT: zip2 p2.d, p1.d, p1.d ; CHECK-NEXT: add x13, x0, x8 ; CHECK-NEXT: add x14, x1, x8 -; CHECK-NEXT: zip1 p2.d, p1.d, p1.d +; CHECK-NEXT: zip1 p1.d, p1.d, p1.d ; CHECK-NEXT: mov z6.d, z1.d ; CHECK-NEXT: mov z7.d, z0.d -; CHECK-NEXT: whilelo p1.d, x12, x9 +; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl] ; CHECK-NEXT: add x8, x8, x11 -; CHECK-NEXT: add x12, x12, x10 -; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] -; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] +; CHECK-NEXT: ld1d { z3.d }, p1/z, [x13] +; CHECK-NEXT: ld1d { z5.d }, p1/z, [x14] ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, p3/m, z7.d -; CHECK-NEXT: mov z1.d, p2/m, z6.d +; CHECK-NEXT: mov z0.d, p2/m, z7.d +; CHECK-NEXT: mov z1.d, p1/m, z6.d +; CHECK-NEXT: whilelo p1.d, x12, x9 +; CHECK-NEXT: add x12, x12, x10 ; CHECK-NEXT: b.mi .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d @@ -114,10 +114,10 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr % ; CHECK-LABEL: complex_mul_predicated_v2f64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x10 -; CHECK-NEXT: neg x11, x10 ; CHECK-NEXT: mov w12, #100 // =0x64 +; CHECK-NEXT: neg x11, x10 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: and x11, x11, x12 @@ -133,20 +133,20 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr % ; CHECK-NEXT: mov z7.d, z0.d ; CHECK-NEXT: add x9, x9, x10 ; CHECK-NEXT: add x8, x8, x12 -; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, #0 +; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0 ; CHECK-NEXT: cmp x11, x9 -; CHECK-NEXT: zip2 p2.d, p1.d, p1.d -; CHECK-NEXT: zip1 p1.d, p1.d, p1.d -; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p1/z, [x13] -; CHECK-NEXT: ld1d { z5.d }, p1/z, [x14] +; CHECK-NEXT: zip2 p1.d, p2.d, p2.d +; CHECK-NEXT: zip1 p2.d, p2.d, p2.d +; CHECK-NEXT: ld1d { z2.d }, p1/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z4.d }, p1/z, [x14, #1, mul vl] +; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] +; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, p2/m, z7.d -; CHECK-NEXT: mov z1.d, p1/m, z6.d +; CHECK-NEXT: mov z0.d, p1/m, z7.d +; CHECK-NEXT: mov z1.d, p2/m, z6.d ; CHECK-NEXT: b.ne .LBB1_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d @@ -217,8 +217,8 @@ exit.block: ; preds = %vector.body define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, ptr %cond) { ; CHECK-LABEL: complex_mul_predicated_x2_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w10, #100 // =0x64 ; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: mov w10, #100 // =0x64 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: whilelo p1.d, xzr, x10 ; CHECK-NEXT: mov x8, xzr @@ -236,19 +236,19 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt ; CHECK-NEXT: mov z7.d, z0.d ; CHECK-NEXT: add x9, x9, x11 ; CHECK-NEXT: add x8, x8, x12 -; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 -; CHECK-NEXT: zip2 p3.d, p1.d, p1.d -; CHECK-NEXT: zip1 p2.d, p1.d, p1.d -; CHECK-NEXT: whilelo p1.d, x9, x10 -; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl] +; CHECK-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; CHECK-NEXT: zip2 p1.d, p2.d, p2.d +; CHECK-NEXT: zip1 p2.d, p2.d, p2.d +; CHECK-NEXT: ld1d { z2.d }, p1/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z4.d }, p1/z, [x14, #1, mul vl] ; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] ; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, p3/m, z7.d +; CHECK-NEXT: mov z0.d, p1/m, z7.d +; CHECK-NEXT: whilelo p1.d, x9, x10 ; CHECK-NEXT: mov z1.d, p2/m, z6.d ; CHECK-NEXT: b.mi .LBB2_1 ; CHECK-NEXT: // %bb.2: // %exit.block diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll index 1696ac8709d40..664d99a3627b5 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll @@ -15,11 +15,11 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: neg x9, x9 ; CHECK-NEXT: mov w10, #100 // =0x64 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: and x10, x9, x10 ; CHECK-NEXT: rdvl x11, #2 @@ -101,18 +101,18 @@ exit.block: ; preds = %vector.body define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_nonzero_init_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: fmov d0, #1.00000000 ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: fmov d2, #2.00000000 ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: mov w10, #100 // =0x64 -; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: fmov d2, #2.00000000 +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: neg x9, x9 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: mov w10, #100 // =0x64 ; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d ; CHECK-NEXT: and x10, x9, x10 ; CHECK-NEXT: rdvl x11, #2 -; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d ; CHECK-NEXT: mov z1.d, p0/m, z2.d ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: zip2 z0.d, z1.d, z3.d @@ -190,12 +190,12 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64_unrolled: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cntw x9 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: neg x9, x9 ; CHECK-NEXT: mov w10, #1000 // =0x3e8 +; CHECK-NEXT: neg x9, x9 ; CHECK-NEXT: rdvl x12, #2 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: and x10, x9, x10 ; CHECK-NEXT: zip2 z0.d, z1.d, z1.d @@ -324,10 +324,10 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia ; CHECK-LABEL: reduction_mix: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z2.d, #0 // =0x0 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: neg x10, x9 ; CHECK-NEXT: mov w11, #100 // =0x64 +; CHECK-NEXT: neg x10, x9 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: and x10, x10, x11 ; CHECK-NEXT: rdvl x11, #2 @@ -349,8 +349,8 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia ; CHECK-NEXT: uaddv d2, p0, z2.d ; CHECK-NEXT: uzp2 z3.d, z1.d, z0.d ; CHECK-NEXT: uzp1 z1.d, z1.d, z0.d -; CHECK-NEXT: fmov x8, d2 ; CHECK-NEXT: faddv d0, p0, z3.d +; CHECK-NEXT: fmov x8, d2 ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z1 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll index 742a7099559f7..17bf5ba6eb48b 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll @@ -8,8 +8,8 @@ target triple = "aarch64" define @complex_mul_const( %a, %b) { ; CHECK-LABEL: complex_mul_const: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmov z7.d, #3.00000000 ; CHECK-NEXT: fmov z24.d, #11.00000000 ; CHECK-NEXT: mov z6.d, z4.d @@ -55,25 +55,25 @@ entry: define @complex_mul_non_const( %a, %b, [2 x double] %c) { ; CHECK-LABEL: complex_mul_non_const: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z6.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d5 killed $d5 def $z5 ; CHECK-NEXT: // kill: def $d4 killed $d4 def $z4 ; CHECK-NEXT: mov z5.d, d5 ; CHECK-NEXT: mov z4.d, d4 ; CHECK-NEXT: mov z24.d, z6.d ; CHECK-NEXT: mov z7.d, z6.d -; CHECK-NEXT: zip2 z25.d, z4.d, z5.d -; CHECK-NEXT: zip1 z4.d, z4.d, z5.d ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 +; CHECK-NEXT: zip2 z1.d, z4.d, z5.d ; CHECK-NEXT: fcmla z7.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: zip1 z2.d, z4.d, z5.d ; CHECK-NEXT: mov z0.d, z6.d -; CHECK-NEXT: fcmla z6.d, p0/m, z24.d, z25.d, #0 -; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0 -; CHECK-NEXT: fcmla z6.d, p0/m, z24.d, z25.d, #90 -; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #90 +; CHECK-NEXT: fcmla z6.d, p0/m, z24.d, z1.d, #0 +; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z2.d, #0 +; CHECK-NEXT: fcmla z6.d, p0/m, z24.d, z1.d, #90 +; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z2.d, #90 ; CHECK-NEXT: mov z1.d, z6.d ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll index 1c254f9ed935d..e6d5a2ac0fd79 100644 --- a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll +++ b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll @@ -96,8 +96,8 @@ entry: define void @test_concat_fptrunc_v4f64_to_v4f32(ptr %ptr) #1 { ; CHECK-LABEL: test_concat_fptrunc_v4f64_to_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmov z0.s, #1.00000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll index 4a2e85c715f7a..83c7f73800af1 100644 --- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll +++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll @@ -9,8 +9,8 @@ define fastcc i8 @allocno_reload_assign() { ; CHECK-LABEL: allocno_reload_assign: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.b, #0 // =0x0 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z16.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: uunpklo z1.h, z0.b ; CHECK-NEXT: uunpkhi z0.h, z0.b @@ -48,12 +48,12 @@ define fastcc i8 @allocno_reload_assign() { ; CHECK-NEXT: punpklo p4.h, p3.b ; CHECK-NEXT: punpkhi p3.h, p3.b ; CHECK-NEXT: st1b { z2.d }, p4, [z16.d] -; CHECK-NEXT: punpklo p4.h, p2.b -; CHECK-NEXT: punpkhi p2.h, p2.b ; CHECK-NEXT: st1b { z3.d }, p3, [z16.d] -; CHECK-NEXT: punpklo p3.h, p4.b -; CHECK-NEXT: st1b { z4.d }, p3, [z16.d] -; CHECK-NEXT: punpkhi p3.h, p4.b +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpklo p4.h, p3.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: st1b { z4.d }, p4, [z16.d] ; CHECK-NEXT: st1b { z5.d }, p3, [z16.d] ; CHECK-NEXT: punpklo p3.h, p2.b ; CHECK-NEXT: punpkhi p2.h, p2.b diff --git a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll index 49ad3ae7d6290..6e13ae6feb66b 100644 --- a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll @@ -228,13 +228,13 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw ; ALL-NEXT: sdiv x9, x9, x8 ; ALL-NEXT: sdiv x11, x11, x10 ; ALL-NEXT: mul x8, x9, x8 +; ALL-NEXT: fmov d2, x9 ; ALL-NEXT: fmov d1, x8 ; ALL-NEXT: mul x10, x11, x10 +; ALL-NEXT: mov v2.d[1], x11 +; ALL-NEXT: str q2, [x0] ; ALL-NEXT: mov v1.d[1], x10 ; ALL-NEXT: sub v0.2d, v0.2d, v1.2d -; ALL-NEXT: fmov d1, x9 -; ALL-NEXT: mov v1.d[1], x11 -; ALL-NEXT: str q1, [x0] ; ALL-NEXT: ret %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, ptr %divdst, align 16 diff --git a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll index 3bc50b2f03d83..650219e03b8a7 100644 --- a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll @@ -228,13 +228,13 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw ; ALL-NEXT: udiv x9, x9, x8 ; ALL-NEXT: udiv x11, x11, x10 ; ALL-NEXT: mul x8, x9, x8 +; ALL-NEXT: fmov d2, x9 ; ALL-NEXT: fmov d1, x8 ; ALL-NEXT: mul x10, x11, x10 +; ALL-NEXT: mov v2.d[1], x11 +; ALL-NEXT: str q2, [x0] ; ALL-NEXT: mov v1.d[1], x10 ; ALL-NEXT: sub v0.2d, v0.2d, v1.2d -; ALL-NEXT: fmov d1, x9 -; ALL-NEXT: mov v1.d[1], x11 -; ALL-NEXT: str q1, [x0] ; ALL-NEXT: ret %div = udiv <2 x i64> %x, %y store <2 x i64> %div, ptr %divdst, align 16 diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index dff4831330deb..bd9d9b99622e3 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -502,9 +502,9 @@ define <16 x i32> @double_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ; CHECK-NEXT: usubl v4.8h, v4.8b, v5.8b ; CHECK-NEXT: ld1 { v6.s }[1], [x7], #4 ; CHECK-NEXT: ld1 { v7.s }[1], [x7] -; CHECK-NEXT: usubl v5.8h, v6.8b, v7.8b ; CHECK-NEXT: shll v0.4s, v4.4h, #16 ; CHECK-NEXT: shll2 v4.4s, v4.8h, #16 +; CHECK-NEXT: usubl v5.8h, v6.8b, v7.8b ; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h ; CHECK-NEXT: saddw2 v1.4s, v4.4s, v1.8h ; CHECK-NEXT: shll v6.4s, v5.4h, #16 @@ -647,10 +647,10 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s1, [x0] ; CHECK-NEXT: add x8, x3, #8 -; CHECK-NEXT: add x11, x3, #12 +; CHECK-NEXT: add x11, x1, #12 ; CHECK-NEXT: str s1, [x4] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ldp s0, s4, [x2] +; CHECK-NEXT: ldr s0, [x2] ; CHECK-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-NEXT: umov w9, v2.h[0] ; CHECK-NEXT: umov w10, v2.h[1] @@ -664,31 +664,32 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: add x9, x1, #4 ; CHECK-NEXT: mov v1.d[1], v2.d[0] ; CHECK-NEXT: mov v0.b[11], w10 -; CHECK-NEXT: add x10, x1, #12 +; CHECK-NEXT: add x10, x3, #12 ; CHECK-NEXT: bic v1.8h, #255, lsl #8 ; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4 -; CHECK-NEXT: ldr s3, [x0, #12] -; CHECK-NEXT: ldp s2, s7, [x0, #4] -; CHECK-NEXT: ld1 { v4.s }[1], [x3] -; CHECK-NEXT: ldp s5, s6, [x2, #8] -; CHECK-NEXT: ld1 { v3.s }[1], [x10] -; CHECK-NEXT: ld1 { v2.s }[1], [x9] -; CHECK-NEXT: ld1 { v5.s }[1], [x8] -; CHECK-NEXT: ld1 { v6.s }[1], [x11] +; CHECK-NEXT: ldr s5, [x0, #4] +; CHECK-NEXT: ldp s2, s3, [x2, #4] +; CHECK-NEXT: ldr s7, [x2, #12] +; CHECK-NEXT: ldp s6, s4, [x0, #8] +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: ld1 { v7.s }[1], [x10] +; CHECK-NEXT: ld1 { v3.s }[1], [x8] +; CHECK-NEXT: ld1 { v2.s }[1], [x3] ; CHECK-NEXT: add x8, x1, #8 -; CHECK-NEXT: ld1 { v7.s }[1], [x8] -; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b -; CHECK-NEXT: ushll v3.8h, v5.8b, #0 -; CHECK-NEXT: uaddl v4.8h, v4.8b, v6.8b -; CHECK-NEXT: uaddw v1.8h, v1.8h, v7.8b +; CHECK-NEXT: ld1 { v4.s }[1], [x11] +; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: uaddl v2.8h, v2.8b, v7.8b +; CHECK-NEXT: uaddl v4.8h, v5.8b, v4.8b +; CHECK-NEXT: uaddw v1.8h, v1.8h, v6.8b ; CHECK-NEXT: uaddw2 v5.8h, v3.8h, v0.16b -; CHECK-NEXT: ushll v0.4s, v2.4h, #3 +; CHECK-NEXT: ushll v6.4s, v2.4h, #3 ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: ushll v6.4s, v4.4h, #3 +; CHECK-NEXT: ushll v0.4s, v4.4h, #3 ; CHECK-NEXT: ushll2 v3.4s, v4.8h, #3 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h -; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v5.8h +; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v1.8h +; CHECK-NEXT: uaddw2 v3.4s, v2.4s, v5.8h ; CHECK-NEXT: uaddw v2.4s, v6.4s, v5.4h ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p @@ -762,35 +763,35 @@ define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: add x8, x1, #8 ; CHECK-NEXT: ldr s6, [x1, #12] ; CHECK-NEXT: ldp s17, s18, [x2, #8] -; CHECK-NEXT: ldp s2, s3, [x2] +; CHECK-NEXT: ldp s3, s5, [x2] ; CHECK-NEXT: add x9, x3, #8 ; CHECK-NEXT: mov v4.16b, v1.16b ; CHECK-NEXT: ldp s7, s16, [x0] -; CHECK-NEXT: ldr s5, [x3, #12] +; CHECK-NEXT: ldr s2, [x3, #12] ; CHECK-NEXT: mov v1.s[1], v6.s[0] -; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v3.s }[1], [x3], #4 ; CHECK-NEXT: mov v4.s[1], v6.s[0] ; CHECK-NEXT: ld1 { v7.s }[1], [x1], #4 ; CHECK-NEXT: ld1 { v16.s }[1], [x1] -; CHECK-NEXT: ld1 { v3.s }[1], [x3] -; CHECK-NEXT: ld1 { v0.s }[1], [x8] +; CHECK-NEXT: ld1 { v5.s }[1], [x3] ; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: ld1 { v0.s }[1], [x8] ; CHECK-NEXT: mov v4.s[2], v18.s[0] -; CHECK-NEXT: mov v18.s[1], v5.s[0] +; CHECK-NEXT: mov v18.s[1], v2.s[0] ; CHECK-NEXT: uaddl v1.8h, v16.8b, v1.8b ; CHECK-NEXT: uaddl v6.8h, v7.8b, v0.8b -; CHECK-NEXT: uaddl v2.8h, v2.8b, v17.8b -; CHECK-NEXT: uaddl v3.8h, v3.8b, v18.8b +; CHECK-NEXT: uaddl v7.8h, v3.8b, v17.8b ; CHECK-NEXT: ushll v0.4s, v1.4h, #3 ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #3 -; CHECK-NEXT: mov v4.s[3], v5.s[0] +; CHECK-NEXT: uaddl v5.8h, v5.8b, v18.8b +; CHECK-NEXT: mov v4.s[3], v2.s[0] ; CHECK-NEXT: uaddw v0.4s, v0.4s, v6.4h ; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v6.8h -; CHECK-NEXT: ushll v7.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3 +; CHECK-NEXT: ushll v16.4s, v5.4h, #3 +; CHECK-NEXT: ushll2 v3.4s, v5.8h, #3 ; CHECK-NEXT: str q4, [x4] -; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v2.8h -; CHECK-NEXT: uaddw v2.4s, v7.4s, v2.4h +; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v7.8h +; CHECK-NEXT: uaddw v2.4s, v16.4s, v7.4h ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -873,8 +874,8 @@ define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ld1 { v4.s }[1], [x11] ; CHECK-NEXT: ld1 { v2.s }[1], [x3] ; CHECK-NEXT: ld1 { v0.s }[1], [x10] -; CHECK-NEXT: ld1 { v7.s }[1], [x9] ; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: ld1 { v7.s }[1], [x9] ; CHECK-NEXT: uaddl v5.8h, v5.8b, v4.8b ; CHECK-NEXT: uaddl v2.8h, v2.8b, v0.8b ; CHECK-NEXT: ushll v16.8h, v0.8b, #0 @@ -972,8 +973,8 @@ define <16 x i32> @extrause_add(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ld1 { v5.s }[1], [x11] ; CHECK-NEXT: ld1 { v3.s }[1], [x3] ; CHECK-NEXT: ld1 { v7.s }[1], [x10] -; CHECK-NEXT: ld1 { v4.s }[1], [x9] ; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: ld1 { v4.s }[1], [x9] ; CHECK-NEXT: uaddl v5.8h, v1.8b, v5.8b ; CHECK-NEXT: uaddl v7.8h, v3.8b, v7.8b ; CHECK-NEXT: uaddl v1.8h, v0.8b, v4.8b @@ -1072,23 +1073,23 @@ define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ld1 { v6.s }[1], [x9] ; CHECK-NEXT: ld1 { v4.s }[1], [x8] ; CHECK-NEXT: uaddl v7.8h, v3.8b, v7.8b -; CHECK-NEXT: uaddl v3.8h, v1.8b, v5.8b +; CHECK-NEXT: uaddl v1.8h, v1.8b, v5.8b ; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b ; CHECK-NEXT: uaddl v4.8h, v0.8b, v4.8b ; CHECK-NEXT: ushll v0.4s, v7.4h, #3 -; CHECK-NEXT: ushll2 v1.4s, v7.8h, #3 -; CHECK-NEXT: ushll v5.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v6.4s, v3.8h, #3 -; CHECK-NEXT: ushll2 v16.4s, v3.8h, #0 -; CHECK-NEXT: ushll v17.4s, v3.4h, #0 -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v2.8h +; CHECK-NEXT: ushll2 v3.4s, v7.8h, #3 +; CHECK-NEXT: ushll v6.4s, v1.4h, #3 +; CHECK-NEXT: ushll2 v16.4s, v1.8h, #3 +; CHECK-NEXT: ushll2 v5.4s, v1.8h, #0 +; CHECK-NEXT: ushll v17.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v18.4s, v7.8h, #0 +; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v2.8h ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h -; CHECK-NEXT: uaddw v2.4s, v5.4s, v4.4h -; CHECK-NEXT: uaddw2 v3.4s, v6.4s, v4.8h -; CHECK-NEXT: ushll2 v4.4s, v7.8h, #0 -; CHECK-NEXT: ushll v5.4s, v7.4h, #0 -; CHECK-NEXT: stp q17, q16, [x4, #32] -; CHECK-NEXT: stp q5, q4, [x4] +; CHECK-NEXT: uaddw v2.4s, v6.4s, v4.4h +; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v4.8h +; CHECK-NEXT: ushll v4.4s, v7.4h, #0 +; CHECK-NEXT: stp q17, q5, [x4, #32] +; CHECK-NEXT: stp q4, q18, [x4] ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -1157,32 +1158,32 @@ define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { define <16 x i32> @extrause_shl(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_shl: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ldp s1, s2, [x0] ; CHECK-NEXT: add x10, x3, #12 -; CHECK-NEXT: ldp s2, s3, [x2] +; CHECK-NEXT: ldp s0, s3, [x2] ; CHECK-NEXT: add x11, x1, #12 ; CHECK-NEXT: ldp s4, s5, [x0, #8] ; CHECK-NEXT: add x8, x3, #8 ; CHECK-NEXT: ldp s6, s7, [x2, #8] ; CHECK-NEXT: add x9, x1, #8 -; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 -; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 -; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v1.s }[1], [x1], #4 +; CHECK-NEXT: ld1 { v2.s }[1], [x1] ; CHECK-NEXT: ld1 { v5.s }[1], [x11] ; CHECK-NEXT: ld1 { v3.s }[1], [x3] ; CHECK-NEXT: ld1 { v7.s }[1], [x10] ; CHECK-NEXT: ld1 { v4.s }[1], [x9] ; CHECK-NEXT: ld1 { v6.s }[1], [x8] -; CHECK-NEXT: uaddl v1.8h, v1.8b, v5.8b +; CHECK-NEXT: uaddl v2.8h, v2.8b, v5.8b ; CHECK-NEXT: uaddl v3.8h, v3.8b, v7.8b -; CHECK-NEXT: uaddl v4.8h, v0.8b, v4.8b -; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b -; CHECK-NEXT: ushll v5.4s, v1.4h, #3 +; CHECK-NEXT: uaddl v4.8h, v1.8b, v4.8b +; CHECK-NEXT: ushll v5.4s, v2.4h, #3 +; CHECK-NEXT: ushll2 v7.4s, v2.8h, #3 +; CHECK-NEXT: uaddl v2.8h, v0.8b, v6.8b ; CHECK-NEXT: ushll v6.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v7.4s, v1.8h, #3 ; CHECK-NEXT: ushll2 v16.4s, v3.8h, #3 -; CHECK-NEXT: uaddw v0.4s, v5.4s, v4.4h ; CHECK-NEXT: uaddw2 v1.4s, v7.4s, v4.8h +; CHECK-NEXT: uaddw v0.4s, v5.4s, v4.4h ; CHECK-NEXT: stp q5, q7, [x4] ; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v2.8h ; CHECK-NEXT: uaddw v2.4s, v6.4s, v2.4h diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll index 9916aeeab1cad..b1ca88975a621 100644 --- a/llvm/test/CodeGen/AArch64/fcmp.ll +++ b/llvm/test/CodeGen/AArch64/fcmp.ll @@ -280,10 +280,10 @@ define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x ; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v2.4s ; CHECK-GI-NEXT: fmov s2, w8 ; CHECK-GI-NEXT: mov v2.s[1], w8 -; CHECK-GI-NEXT: neg v3.4s, v1.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: neg v1.4s, v1.4s ; CHECK-GI-NEXT: mov v2.s[2], w8 -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v3.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: eor v1.16b, v0.16b, v2.16b ; CHECK-GI-NEXT: and v0.16b, v6.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v7.16b, v1.16b @@ -348,10 +348,10 @@ define <3 x float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d, ; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: neg v5.4s, v4.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: neg v4.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s ; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v5.4s ; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b @@ -426,10 +426,10 @@ define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i ; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: neg v5.4s, v4.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: neg v4.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s ; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v5.4s ; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b @@ -545,8 +545,8 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x ; CHECK-GI-NOFP16-LABEL: v7f16_half: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: mov w8, #15 // =0xf -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h7, v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4] +; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5] ; CHECK-GI-NOFP16-NEXT: fmov s4, w8 ; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[4] ; CHECK-GI-NOFP16-NEXT: mov w8, #65535 // =0xffff @@ -555,32 +555,32 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x ; CHECK-GI-NOFP16-NEXT: mov h19, v1.h[6] ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov v5.16b, v4.16b -; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NOFP16-NEXT: fmov s7, w8 +; CHECK-GI-NOFP16-NEXT: mov v7.16b, v4.16b +; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v6.h[0] +; CHECK-GI-NOFP16-NEXT: fmov s6, w8 ; CHECK-GI-NOFP16-NEXT: mov v16.h[1], v17.h[0] -; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.16b, v7.16b +; CHECK-GI-NOFP16-NEXT: mov v7.h[1], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.16b, v6.16b ; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v18.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[1], v7.h[0] +; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v18.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[1], v6.h[0] ; CHECK-GI-NOFP16-NEXT: mov v16.h[2], v19.h[0] -; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v6.4h -; CHECK-GI-NOFP16-NEXT: mov v17.h[2], v7.h[0] +; CHECK-GI-NOFP16-NEXT: mov v7.h[2], v4.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h +; CHECK-GI-NOFP16-NEXT: mov v17.h[2], v6.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v16.4s, v16.4h -; CHECK-GI-NOFP16-NEXT: mov v5.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[3], v7.h[0] -; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v16.4s, v6.4s -; CHECK-GI-NOFP16-NEXT: mov v5.h[4], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[4], v7.h[0] +; CHECK-GI-NOFP16-NEXT: mov v7.h[3], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[3], v6.h[0] +; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v16.4s, v5.4s +; CHECK-GI-NOFP16-NEXT: mov v7.h[4], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[4], v6.h[0] ; CHECK-GI-NOFP16-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-GI-NOFP16-NEXT: mov v5.h[5], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[5], v7.h[0] -; CHECK-GI-NOFP16-NEXT: mov v5.h[6], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[6], v7.h[0] -; CHECK-GI-NOFP16-NEXT: neg v1.8h, v5.8h -; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v5.8h +; CHECK-GI-NOFP16-NEXT: mov v7.h[5], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[5], v6.h[0] +; CHECK-GI-NOFP16-NEXT: mov v7.h[6], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[6], v6.h[0] +; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v7.8h +; CHECK-GI-NOFP16-NEXT: neg v1.8h, v7.8h ; CHECK-GI-NOFP16-NEXT: sshl v0.8h, v0.8h, v1.8h ; CHECK-GI-NOFP16-NEXT: eor v1.16b, v0.16b, v17.16b ; CHECK-GI-NOFP16-NEXT: and v0.16b, v2.16b, v0.16b @@ -609,8 +609,8 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x ; CHECK-GI-FP16-NEXT: mov v7.h[5], v6.h[0] ; CHECK-GI-FP16-NEXT: mov v5.h[6], v4.h[0] ; CHECK-GI-FP16-NEXT: mov v7.h[6], v6.h[0] -; CHECK-GI-FP16-NEXT: neg v1.8h, v5.8h ; CHECK-GI-FP16-NEXT: ushl v0.8h, v0.8h, v5.8h +; CHECK-GI-FP16-NEXT: neg v1.8h, v5.8h ; CHECK-GI-FP16-NEXT: sshl v0.8h, v0.8h, v1.8h ; CHECK-GI-FP16-NEXT: eor v1.16b, v0.16b, v7.16b ; CHECK-GI-FP16-NEXT: and v0.16b, v2.16b, v0.16b @@ -1047,6 +1047,7 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[6] ; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[6] ; CHECK-GI-NOFP16-NEXT: fmov s16, w0 +; CHECK-GI-NOFP16-NEXT: fmov s18, w4 ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] @@ -1054,6 +1055,7 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0] ; CHECK-GI-NOFP16-NEXT: ldr s5, [sp] ; CHECK-GI-NOFP16-NEXT: mov v16.s[1], w1 +; CHECK-GI-NOFP16-NEXT: mov v18.s[1], w5 ; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w8 ; CHECK-GI-NOFP16-NEXT: fmov w9, s5 ; CHECK-GI-NOFP16-NEXT: fmov s5, w7 @@ -1069,27 +1071,25 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w8 ; CHECK-GI-NOFP16-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NOFP16-NEXT: mov v7.s[1], v17.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #40] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v18.s[2], w6 ; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h ; CHECK-GI-NOFP16-NEXT: mov v16.s[3], w3 ; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w9 -; CHECK-GI-NOFP16-NEXT: neg v18.4s, v3.4s ; CHECK-GI-NOFP16-NEXT: mov v7.s[2], v17.s[0] ; CHECK-GI-NOFP16-NEXT: fcmgt v2.4s, v4.4s, v2.4s ; CHECK-GI-NOFP16-NEXT: fmov s4, w8 ; CHECK-GI-NOFP16-NEXT: mov v4.s[1], w8 ; CHECK-GI-NOFP16-NEXT: ushl v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: fmov s3, w4 -; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w5 +; CHECK-GI-NOFP16-NEXT: neg v3.4s, v3.4s ; CHECK-GI-NOFP16-NEXT: mov v4.s[2], w8 -; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v18.4s +; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v3.4s ; CHECK-GI-NOFP16-NEXT: fmov w8, s6 -; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w6 -; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v4.16b ; CHECK-GI-NOFP16-NEXT: mov v5.s[3], w8 +; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v4.16b +; CHECK-GI-NOFP16-NEXT: and v2.16b, v18.16b, v2.16b ; CHECK-GI-NOFP16-NEXT: and v1.16b, v7.16b, v1.16b -; CHECK-GI-NOFP16-NEXT: and v2.16b, v3.16b, v2.16b ; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v16.16b, v5.16b ; CHECK-GI-NOFP16-NEXT: orr v1.16b, v2.16b, v1.16b ; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1] @@ -1111,30 +1111,32 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h ; CHECK-GI-FP16-NEXT: mov w10, #31 // =0x1f ; CHECK-GI-FP16-NEXT: ldr s3, [sp] -; CHECK-GI-FP16-NEXT: fmov s1, w10 +; CHECK-GI-FP16-NEXT: fmov s2, w10 ; CHECK-GI-FP16-NEXT: fmov s6, w0 ; CHECK-GI-FP16-NEXT: ldr s4, [sp, #8] +; CHECK-GI-FP16-NEXT: fmov s17, w4 ; CHECK-GI-FP16-NEXT: ldr s7, [sp, #24] ; CHECK-GI-FP16-NEXT: ldr s16, [sp, #32] ; CHECK-GI-FP16-NEXT: umov w8, v0.h[4] ; CHECK-GI-FP16-NEXT: umov w9, v0.h[5] -; CHECK-GI-FP16-NEXT: mov v1.s[1], w10 +; CHECK-GI-FP16-NEXT: mov v2.s[1], w10 ; CHECK-GI-FP16-NEXT: mov v6.s[1], w1 +; CHECK-GI-FP16-NEXT: mov v17.s[1], w5 ; CHECK-GI-FP16-NEXT: mov v7.s[1], v16.s[0] ; CHECK-GI-FP16-NEXT: ldr s16, [sp, #40] -; CHECK-GI-FP16-NEXT: fmov s2, w8 +; CHECK-GI-FP16-NEXT: fmov s1, w8 ; CHECK-GI-FP16-NEXT: umov w8, v0.h[6] -; CHECK-GI-FP16-NEXT: mov v1.s[2], w10 +; CHECK-GI-FP16-NEXT: mov v2.s[2], w10 ; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-FP16-NEXT: mov v6.s[2], w2 +; CHECK-GI-FP16-NEXT: mov v17.s[2], w6 ; CHECK-GI-FP16-NEXT: mov v7.s[2], v16.s[0] -; CHECK-GI-FP16-NEXT: mov v2.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v1.s[1], w9 ; CHECK-GI-FP16-NEXT: mov w9, #-1 // =0xffffffff ; CHECK-GI-FP16-NEXT: fmov s5, w9 -; CHECK-GI-FP16-NEXT: neg v17.4s, v1.4s ; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-GI-FP16-NEXT: mov v6.s[3], w3 -; CHECK-GI-FP16-NEXT: mov v2.s[2], w8 +; CHECK-GI-FP16-NEXT: mov v1.s[2], w8 ; CHECK-GI-FP16-NEXT: fmov w8, s3 ; CHECK-GI-FP16-NEXT: fmov s3, w7 ; CHECK-GI-FP16-NEXT: mov v5.s[1], w9 @@ -1142,26 +1144,24 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-FP16-NEXT: mov v3.s[1], w8 ; CHECK-GI-FP16-NEXT: fmov w8, s4 ; CHECK-GI-FP16-NEXT: ldr s4, [sp, #16] -; CHECK-GI-FP16-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-GI-FP16-NEXT: fmov s2, w4 +; CHECK-GI-FP16-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-GI-FP16-NEXT: neg v2.4s, v2.4s ; CHECK-GI-FP16-NEXT: mov v5.s[2], w9 -; CHECK-GI-FP16-NEXT: mov v2.s[1], w5 ; CHECK-GI-FP16-NEXT: mov v3.s[2], w8 -; CHECK-GI-FP16-NEXT: sshl v1.4s, v1.4s, v17.4s +; CHECK-GI-FP16-NEXT: sshl v1.4s, v1.4s, v2.4s ; CHECK-GI-FP16-NEXT: fmov w8, s4 -; CHECK-GI-FP16-NEXT: eor v4.16b, v1.16b, v5.16b -; CHECK-GI-FP16-NEXT: mov v2.s[2], w6 +; CHECK-GI-FP16-NEXT: eor v2.16b, v1.16b, v5.16b +; CHECK-GI-FP16-NEXT: and v1.16b, v17.16b, v1.16b ; CHECK-GI-FP16-NEXT: mov v3.s[3], w8 -; CHECK-GI-FP16-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-GI-FP16-NEXT: and v2.16b, v7.16b, v4.16b +; CHECK-GI-FP16-NEXT: and v2.16b, v7.16b, v2.16b ; CHECK-GI-FP16-NEXT: bsl v0.16b, v6.16b, v3.16b ; CHECK-GI-FP16-NEXT: orr v1.16b, v1.16b, v2.16b ; CHECK-GI-FP16-NEXT: mov s2, v0.s[1] ; CHECK-GI-FP16-NEXT: mov s3, v0.s[2] ; CHECK-GI-FP16-NEXT: mov s4, v0.s[3] -; CHECK-GI-FP16-NEXT: fmov w0, s0 ; CHECK-GI-FP16-NEXT: mov s5, v1.s[1] ; CHECK-GI-FP16-NEXT: mov s6, v1.s[2] +; CHECK-GI-FP16-NEXT: fmov w0, s0 ; CHECK-GI-FP16-NEXT: fmov w4, s1 ; CHECK-GI-FP16-NEXT: fmov w1, s2 ; CHECK-GI-FP16-NEXT: fmov w2, s3 diff --git a/llvm/test/CodeGen/AArch64/fdiv-combine.ll b/llvm/test/CodeGen/AArch64/fdiv-combine.ll index 1ed63f3ef2507..0627250d07791 100644 --- a/llvm/test/CodeGen/AArch64/fdiv-combine.ll +++ b/llvm/test/CodeGen/AArch64/fdiv-combine.ll @@ -171,8 +171,8 @@ entry: define @splat_fdiv_nxv2f64(double %D, %a) #1 { ; CHECK-LABEL: splat_fdiv_nxv2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: fdivr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll index 03e64f8b785b0..a78addc490086 100644 --- a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll @@ -604,8 +604,8 @@ define fastcc i1 @quantum_hadamard(i32 %0) { define @fdiv_pow2_nx4xfloat( %i) "target-features"="+sve" { ; CHECK-LABEL: fdiv_pow2_nx4xfloat: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: lslr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fmov z1.s, #9.00000000 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s diff --git a/llvm/test/CodeGen/AArch64/fp-veclib-expansion.ll b/llvm/test/CodeGen/AArch64/fp-veclib-expansion.ll index 67c056c780cc8..2c8e2190f8209 100644 --- a/llvm/test/CodeGen/AArch64/fp-veclib-expansion.ll +++ b/llvm/test/CodeGen/AArch64/fp-veclib-expansion.ll @@ -62,9 +62,9 @@ define @frem_nxv4f32( %unused, @frem_nxv4f32( %unused, @frem_strict_nxv2f64( %unused, ; ARMPL-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; ARMPL-NEXT: .cfi_def_cfa_offset 16 ; ARMPL-NEXT: .cfi_offset w30, -16 -; ARMPL-NEXT: ptrue p0.d ; ARMPL-NEXT: mov z0.d, z1.d ; ARMPL-NEXT: mov z1.d, z2.d +; ARMPL-NEXT: ptrue p0.d ; ARMPL-NEXT: bl armpl_svfmod_f64_x ; ARMPL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; ARMPL-NEXT: ret @@ -102,9 +102,9 @@ define @frem_strict_nxv2f64( %unused, ; SLEEF-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; SLEEF-NEXT: .cfi_def_cfa_offset 16 ; SLEEF-NEXT: .cfi_offset w30, -16 -; SLEEF-NEXT: ptrue p0.d ; SLEEF-NEXT: mov z0.d, z1.d ; SLEEF-NEXT: mov z1.d, z2.d +; SLEEF-NEXT: ptrue p0.d ; SLEEF-NEXT: bl _ZGVsMxvv_fmod ; SLEEF-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; SLEEF-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll index 301d28fd7be56..2ea581359af6f 100644 --- a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll @@ -194,10 +194,10 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fcvtzs v0.2d, v0.2d ; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: smax v0.2s, v0.2s, v2.2s ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -833,10 +833,10 @@ define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fcvtzs v0.2d, v0.2d ; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: smax v0.2s, v0.2s, v2.2s ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index 92fd3183393ea..c45885a38f159 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -697,9 +697,9 @@ define <2 x i1> @test_signed_v2f32_v2i1(<2 x float> %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: fcvtzs v0.2s, v0.2s +; CHECK-NEXT: movi v2.2d, #0xffffffffffffffff ; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: smax v0.2s, v0.2s, v2.2s ; CHECK-NEXT: ret %x = call <2 x i1> @llvm.fptosi.sat.v2f32.v2i1(<2 x float> %f) ret <2 x i1> %x @@ -1620,9 +1620,9 @@ define <4 x i1> @test_signed_v4f16_v4i1(<4 x half> %f) { ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: movi v1.2d, #0000000000000000 ; CHECK-FP16-NEXT: fcvtzs v0.4h, v0.4h +; CHECK-FP16-NEXT: movi v2.2d, #0xffffffffffffffff ; CHECK-FP16-NEXT: smin v0.4h, v0.4h, v1.4h -; CHECK-FP16-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v1.4h +; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v2.4h ; CHECK-FP16-NEXT: ret %x = call <4 x i1> @llvm.fptosi.sat.v4f16.v4i1(<4 x half> %f) ret <4 x i1> %x @@ -1668,9 +1668,9 @@ define <4 x i13> @test_signed_v4f16_v4i13(<4 x half> %f) { ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcvtzs v0.4h, v0.4h ; CHECK-FP16-NEXT: mvni v1.4h, #240, lsl #8 +; CHECK-FP16-NEXT: movi v2.4h, #240, lsl #8 ; CHECK-FP16-NEXT: smin v0.4h, v0.4h, v1.4h -; CHECK-FP16-NEXT: movi v1.4h, #240, lsl #8 -; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v1.4h +; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v2.4h ; CHECK-FP16-NEXT: ret %x = call <4 x i13> @llvm.fptosi.sat.v4f16.v4i13(<4 x half> %f) ret <4 x i13> %x @@ -2103,9 +2103,9 @@ define <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) { ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: movi v1.2d, #0000000000000000 ; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h +; CHECK-FP16-NEXT: movi v2.2d, #0xffffffffffffffff ; CHECK-FP16-NEXT: smin v0.8h, v0.8h, v1.8h -; CHECK-FP16-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v1.8h +; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v2.8h ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret %x = call <8 x i1> @llvm.fptosi.sat.v8f16.v8i1(<8 x half> %f) @@ -2254,9 +2254,9 @@ define <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) { ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h ; CHECK-FP16-NEXT: mvni v1.8h, #240, lsl #8 +; CHECK-FP16-NEXT: movi v2.8h, #240, lsl #8 ; CHECK-FP16-NEXT: smin v0.8h, v0.8h, v1.8h -; CHECK-FP16-NEXT: movi v1.8h, #240, lsl #8 -; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v1.8h +; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v2.8h ; CHECK-FP16-NEXT: ret %x = call <8 x i13> @llvm.fptosi.sat.v8f16.v8i13(<8 x half> %f) ret <8 x i13> %x diff --git a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll index 181f2185893e4..d39c09524e1ad 100644 --- a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll @@ -78,9 +78,9 @@ define <4 x i32> @rotl_v4i32(<4 x i32> %x, <4 x i32> %z) { ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.4s, #31 ; CHECK-NEXT: neg v3.4s, v1.4s +; CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: neg v2.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b diff --git a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll index 97511639ec8cf..cb9f04a7fac48 100644 --- a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll @@ -202,8 +202,8 @@ define <4 x i1> @vec_4xi32_splat_eq(<4 x i32> %x, <4 x i32> %y) nounwind { define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: vec_4xi32_nonsplat_eq: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0] ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s ; CHECK-NEXT: and v0.16b, v1.16b, v0.16b diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll index e7352fe03d01a..8e10847e7aae3 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ b/llvm/test/CodeGen/AArch64/icmp.ll @@ -179,10 +179,10 @@ define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32> ; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: neg v5.4s, v4.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: neg v4.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s ; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v5.4s ; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll index e4d2b516b8fbf..0b730f6e77156 100644 --- a/llvm/test/CodeGen/AArch64/insert-extend.ll +++ b/llvm/test/CodeGen/AArch64/insert-extend.ll @@ -64,104 +64,104 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca ; CHECK-NEXT: ldr d3, [x11] ; CHECK-NEXT: ldr d4, [x10, x8] ; CHECK-NEXT: ldr d5, [x11, x9] +; CHECK-NEXT: shll2 v6.4s, v0.8h, #16 ; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b ; CHECK-NEXT: usubl v3.8h, v4.8b, v5.8b -; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v5.4s, v1.8h, #16 -; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: shll2 v4.4s, v1.8h, #16 +; CHECK-NEXT: saddw v0.4s, v6.4s, v0.4h ; CHECK-NEXT: shll2 v6.4s, v2.8h, #16 -; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h -; CHECK-NEXT: shll2 v4.4s, v3.8h, #16 -; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h -; CHECK-NEXT: saddw v3.4s, v4.4s, v3.4h +; CHECK-NEXT: shll2 v5.4s, v3.8h, #16 +; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h ; CHECK-NEXT: rev64 v4.4s, v0.4s +; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h +; CHECK-NEXT: saddw v3.4s, v5.4s, v3.4h ; CHECK-NEXT: rev64 v5.4s, v1.4s ; CHECK-NEXT: rev64 v6.4s, v2.4s -; CHECK-NEXT: rev64 v7.4s, v3.4s ; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s ; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s +; CHECK-NEXT: rev64 v7.4s, v3.4s ; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s ; CHECK-NEXT: sub v6.4s, v2.4s, v6.4s ; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s -; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s ; CHECK-NEXT: zip1 v16.4s, v5.4s, v4.4s -; CHECK-NEXT: ext v1.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s ; CHECK-NEXT: zip2 v3.4s, v6.4s, v7.4s ; CHECK-NEXT: mov v6.s[1], v7.s[0] ; CHECK-NEXT: ext v7.16b, v5.16b, v16.16b, #8 ; CHECK-NEXT: mov v5.s[3], v4.s[2] -; CHECK-NEXT: uzp1 v4.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v0.4s +; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #8 ; CHECK-NEXT: mov v6.d[1], v7.d[1] ; CHECK-NEXT: mov v3.d[1], v5.d[1] +; CHECK-NEXT: uzp1 v1.4s, v4.4s, v0.4s +; CHECK-NEXT: uzp2 v4.4s, v4.4s, v0.4s ; CHECK-NEXT: uzp2 v5.4s, v2.4s, v0.4s ; CHECK-NEXT: uzp1 v0.4s, v2.4s, v0.4s -; CHECK-NEXT: sub v1.4s, v4.4s, v1.4s ; CHECK-NEXT: add v2.4s, v3.4s, v6.4s ; CHECK-NEXT: sub v3.4s, v6.4s, v3.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: rev64 v6.4s, v1.4s ; CHECK-NEXT: rev64 v4.4s, v2.4s ; CHECK-NEXT: rev64 v5.4s, v3.4s -; CHECK-NEXT: addp v16.4s, v1.4s, v3.4s +; CHECK-NEXT: rev64 v6.4s, v1.4s ; CHECK-NEXT: rev64 v7.4s, v0.4s +; CHECK-NEXT: addp v16.4s, v1.4s, v3.4s ; CHECK-NEXT: addp v17.4s, v0.4s, v2.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s ; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s ; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s ; CHECK-NEXT: zip1 v18.4s, v17.4s, v17.4s -; CHECK-NEXT: ext v6.16b, v1.16b, v16.16b, #8 ; CHECK-NEXT: ext v4.16b, v17.16b, v2.16b, #4 ; CHECK-NEXT: ext v5.16b, v16.16b, v3.16b, #4 ; CHECK-NEXT: mov v20.16b, v3.16b +; CHECK-NEXT: ext v6.16b, v1.16b, v16.16b, #8 ; CHECK-NEXT: ext v7.16b, v0.16b, v17.16b, #4 ; CHECK-NEXT: mov v21.16b, v2.16b ; CHECK-NEXT: trn2 v0.4s, v18.4s, v0.4s -; CHECK-NEXT: ext v19.16b, v6.16b, v1.16b, #4 -; CHECK-NEXT: mov v1.s[2], v16.s[1] ; CHECK-NEXT: mov v20.s[2], v16.s[3] ; CHECK-NEXT: zip2 v4.4s, v4.4s, v17.4s ; CHECK-NEXT: zip2 v5.4s, v5.4s, v16.4s ; CHECK-NEXT: mov v21.s[2], v17.s[3] +; CHECK-NEXT: ext v19.16b, v6.16b, v1.16b, #4 ; CHECK-NEXT: ext v7.16b, v7.16b, v7.16b, #4 -; CHECK-NEXT: mov v18.16b, v1.16b +; CHECK-NEXT: mov v1.s[2], v16.s[1] ; CHECK-NEXT: ext v2.16b, v2.16b, v4.16b, #12 ; CHECK-NEXT: ext v3.16b, v3.16b, v5.16b, #12 ; CHECK-NEXT: uzp2 v4.4s, v6.4s, v19.4s ; CHECK-NEXT: mov v5.16b, v7.16b ; CHECK-NEXT: mov v6.16b, v20.16b +; CHECK-NEXT: mov v18.16b, v1.16b ; CHECK-NEXT: mov v19.16b, v21.16b -; CHECK-NEXT: mov v18.s[1], v16.s[0] ; CHECK-NEXT: sub v7.4s, v0.4s, v7.4s ; CHECK-NEXT: mov v6.s[1], v16.s[2] ; CHECK-NEXT: mov v5.s[0], v17.s[1] +; CHECK-NEXT: mov v18.s[1], v16.s[0] ; CHECK-NEXT: mov v19.s[1], v17.s[2] ; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s ; CHECK-NEXT: sub v16.4s, v20.4s, v3.4s ; CHECK-NEXT: sub v17.4s, v21.4s, v2.4s -; CHECK-NEXT: add v4.4s, v18.4s, v4.4s ; CHECK-NEXT: add v3.4s, v6.4s, v3.4s ; CHECK-NEXT: add v0.4s, v0.4s, v5.4s +; CHECK-NEXT: add v4.4s, v18.4s, v4.4s ; CHECK-NEXT: add v2.4s, v19.4s, v2.4s -; CHECK-NEXT: mov v4.d[1], v1.d[1] ; CHECK-NEXT: mov v3.d[1], v16.d[1] ; CHECK-NEXT: mov v0.d[1], v7.d[1] +; CHECK-NEXT: mov v4.d[1], v1.d[1] ; CHECK-NEXT: mov v2.d[1], v17.d[1] -; CHECK-NEXT: cmlt v6.8h, v4.8h, #0 ; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 ; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v4.8h, #0 ; CHECK-NEXT: cmlt v7.8h, v2.8h, #0 -; CHECK-NEXT: add v4.4s, v6.4s, v4.4s ; CHECK-NEXT: add v3.4s, v1.4s, v3.4s ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s +; CHECK-NEXT: add v4.4s, v6.4s, v4.4s ; CHECK-NEXT: add v2.4s, v7.4s, v2.4s ; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b ; CHECK-NEXT: eor v2.16b, v2.16b, v7.16b ; CHECK-NEXT: eor v3.16b, v4.16b, v6.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v3.4s +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll index 29f9c0336bbcc..542b2e90ffc15 100644 --- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll +++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll @@ -9,9 +9,9 @@ define @vec_scalable_subvec_scalable_idx_zero_i8(ptr %a, ptr % ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.s }, p1/z, [x1] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1b { z1.s }, p0/z, [x1] ; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h ; CHECK-NEXT: ret @@ -25,9 +25,9 @@ define @vec_scalable_subvec_scalable_idx_nonzero_i8(ptr %a, pt ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.s }, p1/z, [x1] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1b { z1.s }, p0/z, [x1] ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h ; CHECK-NEXT: ret @@ -41,9 +41,9 @@ define @vec_scalable_subvec_scalable_idx_zero_i16(ptr %a, ptr ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.d }, p1/z, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x1] ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: ret @@ -57,9 +57,9 @@ define @vec_scalable_subvec_scalable_idx_nonzero_i16(ptr %a, ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.d }, p1/z, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x1] ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s ; CHECK-NEXT: ret @@ -76,10 +76,10 @@ define @vec_scalable_subvec_fixed_idx_zero_i8(ptr %a, ptr %b) ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ptrue p1.h, vl8 -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] -; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: ret %vec = load , ptr %a %subvec = load <8 x i8>, ptr %b @@ -92,19 +92,19 @@ define @vec_scalable_subvec_fixed_idx_nonzero_i8(ptr %a, ptr % ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cnth x8 -; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: sub x8, x8, #8 ; CHECK-NEXT: mov w9, #8 // =0x8 ; CHECK-NEXT: cmp x8, #8 -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #1 -; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] -; CHECK-NEXT: st1h { z1.h }, p0, [sp] -; CHECK-NEXT: str q0, [x9, x8] +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -120,10 +120,10 @@ define @vec_scalable_subvec_fixed_idx_zero_i16(ptr %a, ptr %b ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ptrue p1.s, vl4 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] -; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ret %vec = load , ptr %a %subvec = load <4 x i16>, ptr %b @@ -136,19 +136,19 @@ define @vec_scalable_subvec_fixed_idx_nonzero_i16(ptr %a, ptr ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cntw x8 -; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: sub x8, x8, #4 ; CHECK-NEXT: mov w9, #4 // =0x4 ; CHECK-NEXT: cmp x8, #4 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #2 -; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] -; CHECK-NEXT: st1w { z1.s }, p0, [sp] -; CHECK-NEXT: str q0, [x9, x8] +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -164,10 +164,10 @@ define @vec_scalable_subvec_fixed_idx_zero_i32(ptr %a, ptr %b ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ptrue p1.d, vl2 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0] -; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %vec = load , ptr %a %subvec = load <2 x i32>, ptr %b @@ -180,19 +180,19 @@ define @vec_scalable_subvec_fixed_idx_nonzero_i32(ptr %a, ptr ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x8 -; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: mov w9, #2 // =0x2 ; CHECK-NEXT: cmp x8, #2 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0] -; CHECK-NEXT: st1d { z1.d }, p0, [sp] -; CHECK-NEXT: str q0, [x9, x8] +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll index 1a4ab6ab334a6..9bd2ed240810d 100644 --- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll +++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s @@ -31,28 +32,28 @@ define i32 @ctz_nxv32i1( %a) #0 { ; CHECK-NEXT: neg x8, x8 ; CHECK-NEXT: punpklo p3.h, p1.b ; CHECK-NEXT: rdvl x9, #2 -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: rdvl x8, #-1 -; CHECK-NEXT: punpkhi p1.h, p1.b +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: inch z0.h, all, mul #4 +; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p2.h +; CHECK-NEXT: inch z0.h, all, mul #4 ; CHECK-NEXT: mov z5.h, p3/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: add z1.h, z0.h, z1.h -; CHECK-NEXT: add z4.h, z0.h, z2.h ; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z7.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add z1.h, z0.h, z1.h +; CHECK-NEXT: add z4.h, z0.h, z2.h ; CHECK-NEXT: and z0.d, z0.d, z3.d ; CHECK-NEXT: add z2.h, z1.h, z2.h ; CHECK-NEXT: and z3.d, z4.d, z5.d ; CHECK-NEXT: and z1.d, z1.d, z6.d ; CHECK-NEXT: and z2.d, z2.d, z7.d -; CHECK-NEXT: umax z0.h, p2/m, z0.h, z3.h -; CHECK-NEXT: umax z1.h, p2/m, z1.h, z2.h -; CHECK-NEXT: umax z0.h, p2/m, z0.h, z1.h -; CHECK-NEXT: umaxv h0, p2, z0.h +; CHECK-NEXT: umax z0.h, p0/m, z0.h, z3.h +; CHECK-NEXT: umax z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: sub w8, w9, w8 ; CHECK-NEXT: and w0, w8, #0xffff @@ -65,12 +66,12 @@ define i32 @ctz_nxv4i32( %a) #0 { ; CHECK-LABEL: ctz_nxv4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: index z1.s, #0, #-1 ; CHECK-NEXT: cntw x9 -; CHECK-NEXT: incw z1.s ; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 -; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z0.d, z1.d, z0.d +; CHECK-NEXT: index z0.s, #0, #-1 +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: incw z0.s +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: and z0.s, z0.s, #0xff ; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w8, s0 @@ -87,38 +88,38 @@ define i64 @vscale_4096( %a) #1 { ; CHECK-LABEL: vscale_4096: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: index z1.s, #0, #-1 ; CHECK-NEXT: cntw x8 -; CHECK-NEXT: cnth x9 ; CHECK-NEXT: neg x8, x8 -; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: neg x8, x9 ; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: index z0.s, #0, #-1 +; CHECK-NEXT: mov z0.s, w8 +; CHECK-NEXT: cnth x8 +; CHECK-NEXT: neg x8, x8 +; CHECK-NEXT: incw z1.s, all, mul #4 +; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: incw z0.s, all, mul #4 -; CHECK-NEXT: add z1.s, z0.s, z1.s -; CHECK-NEXT: add z5.s, z0.s, z2.s +; CHECK-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEXT: add z4.s, z1.s, z2.s ; CHECK-NEXT: punpkhi p2.h, p1.b ; CHECK-NEXT: punpkhi p3.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: add z2.s, z1.s, z2.s -; CHECK-NEXT: punpklo p1.h, p1.b ; CHECK-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p2.s -; CHECK-NEXT: mov z4.s, p3/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: add z2.s, z0.s, z2.s +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: mov z5.s, p3/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z6.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z7.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: and z2.d, z2.d, z4.d -; CHECK-NEXT: and z3.d, z5.d, z6.d -; CHECK-NEXT: and z0.d, z0.d, z7.d -; CHECK-NEXT: umax z1.s, p2/m, z1.s, z2.s -; CHECK-NEXT: umax z0.s, p2/m, z0.s, z3.s -; CHECK-NEXT: umax z0.s, p2/m, z0.s, z1.s -; CHECK-NEXT: umaxv s0, p2, z0.s +; CHECK-NEXT: and z0.d, z0.d, z3.d +; CHECK-NEXT: and z2.d, z2.d, z5.d +; CHECK-NEXT: and z3.d, z4.d, z6.d +; CHECK-NEXT: and z1.d, z1.d, z7.d +; CHECK-NEXT: umax z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: umax z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: sub w0, w9, w8 ; CHECK-NEXT: ret @@ -130,21 +131,21 @@ define i64 @vscale_4096_poison( %a) #1 { ; CHECK-LABEL: vscale_4096_poison: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: index z1.h, #0, #-1 ; CHECK-NEXT: cnth x8 -; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: neg x8, x8 -; CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: index z0.h, #0, #-1 +; CHECK-NEXT: mov z0.h, w8 +; CHECK-NEXT: inch z1.h, all, mul #2 ; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: inch z0.h, all, mul #2 -; CHECK-NEXT: add z1.h, z0.h, z1.h +; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: and z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z3.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z3.d ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w8, s0 @@ -161,16 +162,16 @@ define i32 @ctz_nxv8i1_no_range( %a) { ; CHECK-LABEL: ctz_nxv8i1_no_range: ; CHECK: // %bb.0: ; CHECK-NEXT: index z0.s, #0, #-1 -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: cntw x8 -; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: neg x8, x8 +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: cnth x9 ; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: incw z0.s, all, mul #2 ; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: incw z0.s, all, mul #2 ; CHECK-NEXT: add z1.s, z0.s, z1.s ; CHECK-NEXT: and z0.d, z0.d, z2.d ; CHECK-NEXT: and z1.d, z1.d, z3.d @@ -212,8 +213,8 @@ define i32 @ctz_nxv16i1_poison( %pg, %a) { define i32 @ctz_and_nxv16i1( %pg, %a, %b) { ; CHECK-LABEL: ctz_and_nxv16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: brkb p0.b, p1/z, p0.b ; CHECK-NEXT: cntp x0, p0, p0.b ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index f5a7b5dc9f492..ae4ced258bb8e 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -6555,18 +6555,18 @@ define <3 x bfloat> @stofp_v3i64_v3bf16(<3 x i64> %a) { ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: scvtf v1.2d, v2.2d -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: scvtf v0.2d, v0.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -6582,18 +6582,18 @@ define <3 x bfloat> @utofp_v3i64_v3bf16(<3 x i64> %a) { ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ucvtf v1.2d, v2.2d -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -6606,17 +6606,17 @@ define <4 x bfloat> @stofp_v4i64_v4bf16(<4 x i64> %a) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: scvtf v0.2d, v0.2d ; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -6629,17 +6629,17 @@ define <4 x bfloat> @utofp_v4i64_v4bf16(<4 x i64> %a) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -6658,22 +6658,22 @@ define <8 x bfloat> @stofp_v8i64_v8bf16(<8 x i64> %a) { ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: ushr v4.4s, v2.4s, #16 ; CHECK-NEXT: ushr v5.4s, v0.4s, #16 -; CHECK-NEXT: add v6.4s, v2.4s, v1.4s -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v4.16b, v4.16b, v3.16b -; CHECK-NEXT: and v3.16b, v5.16b, v3.16b +; CHECK-NEXT: add v6.4s, v2.4s, v3.4s +; CHECK-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-NEXT: and v1.16b, v5.16b, v1.16b ; CHECK-NEXT: fcmeq v5.4s, v2.4s, v2.4s ; CHECK-NEXT: orr v2.4s, #64, lsl #16 ; CHECK-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fcmeq v6.4s, v0.4s, v0.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 ; CHECK-NEXT: bit v2.16b, v4.16b, v5.16b -; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v6.16b ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h ; CHECK-NEXT: ret entry: @@ -6692,22 +6692,22 @@ define <8 x bfloat> @utofp_v8i64_v8bf16(<8 x i64> %a) { ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: ushr v4.4s, v2.4s, #16 ; CHECK-NEXT: ushr v5.4s, v0.4s, #16 -; CHECK-NEXT: add v6.4s, v2.4s, v1.4s -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v4.16b, v4.16b, v3.16b -; CHECK-NEXT: and v3.16b, v5.16b, v3.16b +; CHECK-NEXT: add v6.4s, v2.4s, v3.4s +; CHECK-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-NEXT: and v1.16b, v5.16b, v1.16b ; CHECK-NEXT: fcmeq v5.4s, v2.4s, v2.4s ; CHECK-NEXT: orr v2.4s, #64, lsl #16 ; CHECK-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fcmeq v6.4s, v0.4s, v0.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 ; CHECK-NEXT: bit v2.16b, v4.16b, v5.16b -; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v6.16b ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h ; CHECK-NEXT: ret entry: @@ -6718,51 +6718,51 @@ entry: define <16 x bfloat> @stofp_v16i64_v16bf16(<16 x i64> %a) { ; CHECK-LABEL: stofp_v16i64_v16bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v2.2d, v2.2d ; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: scvtf v2.2d, v2.2d ; CHECK-NEXT: scvtf v6.2d, v6.2d ; CHECK-NEXT: scvtf v4.2d, v4.2d -; CHECK-NEXT: scvtf v3.2d, v3.2d ; CHECK-NEXT: scvtf v1.2d, v1.2d +; CHECK-NEXT: scvtf v3.2d, v3.2d ; CHECK-NEXT: scvtf v7.2d, v7.2d ; CHECK-NEXT: scvtf v5.2d, v5.2d -; CHECK-NEXT: fcvtn v2.2s, v2.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: fcvtn v2.2s, v2.2d ; CHECK-NEXT: fcvtn v6.2s, v6.2d ; CHECK-NEXT: fcvtn v4.2s, v4.2d -; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v6.4s, v7.2d ; CHECK-NEXT: fcvtn2 v4.4s, v5.2d -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: ushr v5.4s, v2.4s, #16 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: ushr v7.4s, v0.4s, #16 -; CHECK-NEXT: add v17.4s, v2.4s, v1.4s -; CHECK-NEXT: add v19.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v5.4s, v2.4s, #16 ; CHECK-NEXT: ushr v16.4s, v6.4s, #16 -; CHECK-NEXT: ushr v18.4s, v4.4s, #16 -; CHECK-NEXT: add v20.4s, v6.4s, v1.4s -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: and v5.16b, v5.16b, v3.16b -; CHECK-NEXT: and v7.16b, v7.16b, v3.16b -; CHECK-NEXT: and v16.16b, v16.16b, v3.16b -; CHECK-NEXT: and v3.16b, v18.16b, v3.16b -; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: add v5.4s, v5.4s, v17.4s +; CHECK-NEXT: ushr v17.4s, v4.4s, #16 +; CHECK-NEXT: add v19.4s, v0.4s, v3.4s +; CHECK-NEXT: add v18.4s, v2.4s, v3.4s +; CHECK-NEXT: add v20.4s, v6.4s, v3.4s +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: and v7.16b, v7.16b, v1.16b +; CHECK-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-NEXT: and v16.16b, v16.16b, v1.16b +; CHECK-NEXT: and v1.16b, v17.16b, v1.16b ; CHECK-NEXT: fcmeq v17.4s, v2.4s, v2.4s +; CHECK-NEXT: orr v2.4s, #64, lsl #16 ; CHECK-NEXT: add v7.4s, v7.4s, v19.4s ; CHECK-NEXT: fcmeq v19.4s, v6.4s, v6.4s -; CHECK-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-NEXT: add v16.4s, v16.4s, v20.4s -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: add v5.4s, v5.4s, v18.4s +; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: fcmeq v3.4s, v4.4s, v4.4s +; CHECK-NEXT: add v16.4s, v16.4s, v20.4s +; CHECK-NEXT: orr v0.4s, #64, lsl #16 ; CHECK-NEXT: orr v6.4s, #64, lsl #16 ; CHECK-NEXT: orr v4.4s, #64, lsl #16 ; CHECK-NEXT: bit v2.16b, v5.16b, v17.16b -; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b ; CHECK-NEXT: mov v5.16b, v19.16b +; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b ; CHECK-NEXT: bif v1.16b, v4.16b, v3.16b ; CHECK-NEXT: bsl v5.16b, v16.16b, v6.16b ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h @@ -6776,51 +6776,51 @@ entry: define <16 x bfloat> @utofp_v16i64_v16bf16(<16 x i64> %a) { ; CHECK-LABEL: utofp_v16i64_v16bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v2.2d, v2.2d ; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf v2.2d, v2.2d ; CHECK-NEXT: ucvtf v6.2d, v6.2d ; CHECK-NEXT: ucvtf v4.2d, v4.2d -; CHECK-NEXT: ucvtf v3.2d, v3.2d ; CHECK-NEXT: ucvtf v1.2d, v1.2d +; CHECK-NEXT: ucvtf v3.2d, v3.2d ; CHECK-NEXT: ucvtf v7.2d, v7.2d ; CHECK-NEXT: ucvtf v5.2d, v5.2d -; CHECK-NEXT: fcvtn v2.2s, v2.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: fcvtn v2.2s, v2.2d ; CHECK-NEXT: fcvtn v6.2s, v6.2d ; CHECK-NEXT: fcvtn v4.2s, v4.2d -; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v6.4s, v7.2d ; CHECK-NEXT: fcvtn2 v4.4s, v5.2d -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: ushr v5.4s, v2.4s, #16 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: ushr v7.4s, v0.4s, #16 -; CHECK-NEXT: add v17.4s, v2.4s, v1.4s -; CHECK-NEXT: add v19.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v5.4s, v2.4s, #16 ; CHECK-NEXT: ushr v16.4s, v6.4s, #16 -; CHECK-NEXT: ushr v18.4s, v4.4s, #16 -; CHECK-NEXT: add v20.4s, v6.4s, v1.4s -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: and v5.16b, v5.16b, v3.16b -; CHECK-NEXT: and v7.16b, v7.16b, v3.16b -; CHECK-NEXT: and v16.16b, v16.16b, v3.16b -; CHECK-NEXT: and v3.16b, v18.16b, v3.16b -; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: add v5.4s, v5.4s, v17.4s +; CHECK-NEXT: ushr v17.4s, v4.4s, #16 +; CHECK-NEXT: add v19.4s, v0.4s, v3.4s +; CHECK-NEXT: add v18.4s, v2.4s, v3.4s +; CHECK-NEXT: add v20.4s, v6.4s, v3.4s +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: and v7.16b, v7.16b, v1.16b +; CHECK-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-NEXT: and v16.16b, v16.16b, v1.16b +; CHECK-NEXT: and v1.16b, v17.16b, v1.16b ; CHECK-NEXT: fcmeq v17.4s, v2.4s, v2.4s +; CHECK-NEXT: orr v2.4s, #64, lsl #16 ; CHECK-NEXT: add v7.4s, v7.4s, v19.4s ; CHECK-NEXT: fcmeq v19.4s, v6.4s, v6.4s -; CHECK-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-NEXT: add v16.4s, v16.4s, v20.4s -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: add v5.4s, v5.4s, v18.4s +; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: fcmeq v3.4s, v4.4s, v4.4s +; CHECK-NEXT: add v16.4s, v16.4s, v20.4s +; CHECK-NEXT: orr v0.4s, #64, lsl #16 ; CHECK-NEXT: orr v6.4s, #64, lsl #16 ; CHECK-NEXT: orr v4.4s, #64, lsl #16 ; CHECK-NEXT: bit v2.16b, v5.16b, v17.16b -; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b ; CHECK-NEXT: mov v5.16b, v19.16b +; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b ; CHECK-NEXT: bif v1.16b, v4.16b, v3.16b ; CHECK-NEXT: bsl v5.16b, v16.16b, v6.16b ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h @@ -6834,107 +6834,107 @@ entry: define <32 x bfloat> @stofp_v32i64_v32bf16(<32 x i64> %a) { ; CHECK-LABEL: stofp_v32i64_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v16.2d, v2.2d -; CHECK-NEXT: scvtf v17.2d, v0.2d -; CHECK-NEXT: scvtf v18.2d, v3.2d -; CHECK-NEXT: scvtf v19.2d, v6.2d -; CHECK-NEXT: ldp q24, q23, [sp, #96] -; CHECK-NEXT: scvtf v21.2d, v1.2d -; CHECK-NEXT: scvtf v22.2d, v4.2d +; CHECK-NEXT: scvtf v17.2d, v2.2d +; CHECK-NEXT: scvtf v18.2d, v0.2d +; CHECK-NEXT: scvtf v19.2d, v3.2d +; CHECK-NEXT: scvtf v3.2d, v6.2d +; CHECK-NEXT: ldp q21, q20, [sp, #32] +; CHECK-NEXT: scvtf v4.2d, v4.2d ; CHECK-NEXT: scvtf v6.2d, v7.2d -; CHECK-NEXT: scvtf v7.2d, v5.2d -; CHECK-NEXT: movi v3.4s, #127, msl #8 -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: fcvtn v0.2s, v16.2d -; CHECK-NEXT: ldp q20, q16, [sp, #32] -; CHECK-NEXT: fcvtn v1.2s, v17.2d -; CHECK-NEXT: ldp q5, q17, [sp] -; CHECK-NEXT: fcvtn v4.2s, v19.2d -; CHECK-NEXT: scvtf v23.2d, v23.2d +; CHECK-NEXT: scvtf v5.2d, v5.2d +; CHECK-NEXT: ldp q24, q23, [sp, #64] +; CHECK-NEXT: movi v16.4s, #1 +; CHECK-NEXT: fcvtn v0.2s, v17.2d +; CHECK-NEXT: scvtf v17.2d, v1.2d +; CHECK-NEXT: fcvtn v1.2s, v18.2d +; CHECK-NEXT: fcvtn v3.2s, v3.2d +; CHECK-NEXT: ldp q18, q7, [sp] +; CHECK-NEXT: scvtf v21.2d, v21.2d +; CHECK-NEXT: fcvtn v4.2s, v4.2d +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: scvtf v20.2d, v20.2d -; CHECK-NEXT: scvtf v16.2d, v16.2d -; CHECK-NEXT: fcvtn2 v0.4s, v18.2d -; CHECK-NEXT: ldp q19, q18, [sp, #64] -; CHECK-NEXT: scvtf v25.2d, v5.2d -; CHECK-NEXT: fcvtn v5.2s, v22.2d -; CHECK-NEXT: fcvtn2 v1.4s, v21.2d -; CHECK-NEXT: scvtf v21.2d, v24.2d -; CHECK-NEXT: scvtf v17.2d, v17.2d -; CHECK-NEXT: fcvtn2 v4.4s, v6.2d +; CHECK-NEXT: fcvtn2 v0.4s, v19.2d +; CHECK-NEXT: ldp q22, q19, [sp, #96] +; CHECK-NEXT: fcvtn2 v1.4s, v17.2d +; CHECK-NEXT: fcvtn2 v3.4s, v6.2d +; CHECK-NEXT: scvtf v18.2d, v18.2d +; CHECK-NEXT: scvtf v17.2d, v24.2d +; CHECK-NEXT: fcvtn v6.2s, v21.2d +; CHECK-NEXT: fcvtn2 v4.4s, v5.2d +; CHECK-NEXT: scvtf v22.2d, v22.2d +; CHECK-NEXT: scvtf v21.2d, v23.2d +; CHECK-NEXT: scvtf v7.2d, v7.2d +; CHECK-NEXT: ushr v24.4s, v0.4s, #16 +; CHECK-NEXT: add v5.4s, v0.4s, v2.4s ; CHECK-NEXT: scvtf v19.2d, v19.2d -; CHECK-NEXT: scvtf v6.2d, v18.2d -; CHECK-NEXT: fcvtn v18.2s, v20.2d -; CHECK-NEXT: ushr v22.4s, v0.4s, #16 -; CHECK-NEXT: add v20.4s, v0.4s, v3.4s -; CHECK-NEXT: fcvtn2 v5.4s, v7.2d -; CHECK-NEXT: fcvtn v24.2s, v25.2d -; CHECK-NEXT: ushr v7.4s, v1.4s, #16 -; CHECK-NEXT: fcvtn v21.2s, v21.2d -; CHECK-NEXT: add v26.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v27.4s, v4.4s, #16 -; CHECK-NEXT: fcvtn v19.2s, v19.2d -; CHECK-NEXT: fcvtn2 v18.4s, v16.2d -; CHECK-NEXT: and v22.16b, v22.16b, v2.16b -; CHECK-NEXT: and v7.16b, v7.16b, v2.16b -; CHECK-NEXT: fcmeq v25.4s, v0.4s, v0.4s +; CHECK-NEXT: ushr v23.4s, v1.4s, #16 +; CHECK-NEXT: ushr v25.4s, v3.4s, #16 +; CHECK-NEXT: fcvtn v18.2s, v18.2d +; CHECK-NEXT: fcvtn2 v6.4s, v20.2d +; CHECK-NEXT: add v26.4s, v1.4s, v2.4s +; CHECK-NEXT: fcvtn v17.2s, v17.2d +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b +; CHECK-NEXT: fcvtn v22.2s, v22.2d +; CHECK-NEXT: fcmeq v20.4s, v0.4s, v0.4s +; CHECK-NEXT: and v23.16b, v23.16b, v16.16b ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: ushr v28.4s, v5.4s, #16 -; CHECK-NEXT: fcvtn2 v24.4s, v17.2d -; CHECK-NEXT: add v17.4s, v5.4s, v3.4s -; CHECK-NEXT: fcvtn2 v21.4s, v23.2d -; CHECK-NEXT: and v16.16b, v27.16b, v2.16b -; CHECK-NEXT: add v20.4s, v22.4s, v20.4s -; CHECK-NEXT: fcvtn2 v19.4s, v6.2d -; CHECK-NEXT: add v7.4s, v7.4s, v26.4s -; CHECK-NEXT: ushr v26.4s, v18.4s, #16 -; CHECK-NEXT: and v23.16b, v28.16b, v2.16b -; CHECK-NEXT: add v22.4s, v4.4s, v3.4s -; CHECK-NEXT: fcmeq v6.4s, v1.4s, v1.4s -; CHECK-NEXT: ushr v27.4s, v24.4s, #16 -; CHECK-NEXT: add v30.4s, v24.4s, v3.4s +; CHECK-NEXT: fcmeq v27.4s, v3.4s, v3.4s +; CHECK-NEXT: fcvtn2 v18.4s, v7.2d +; CHECK-NEXT: add v7.4s, v3.4s, v2.4s +; CHECK-NEXT: orr v3.4s, #64, lsl #16 +; CHECK-NEXT: add v5.4s, v24.4s, v5.4s +; CHECK-NEXT: and v24.16b, v25.16b, v16.16b +; CHECK-NEXT: ushr v25.4s, v4.4s, #16 +; CHECK-NEXT: fcvtn2 v22.4s, v19.2d +; CHECK-NEXT: add v19.4s, v23.4s, v26.4s +; CHECK-NEXT: ushr v26.4s, v6.4s, #16 +; CHECK-NEXT: fcvtn2 v17.4s, v21.2d +; CHECK-NEXT: fcmeq v21.4s, v1.4s, v1.4s ; CHECK-NEXT: orr v1.4s, #64, lsl #16 -; CHECK-NEXT: ushr v28.4s, v21.4s, #16 -; CHECK-NEXT: add v31.4s, v21.4s, v3.4s -; CHECK-NEXT: and v26.16b, v26.16b, v2.16b -; CHECK-NEXT: add v17.4s, v23.4s, v17.4s -; CHECK-NEXT: add v23.4s, v18.4s, v3.4s -; CHECK-NEXT: ushr v29.4s, v19.4s, #16 -; CHECK-NEXT: and v27.16b, v27.16b, v2.16b -; CHECK-NEXT: add v3.4s, v19.4s, v3.4s -; CHECK-NEXT: add v16.4s, v16.4s, v22.4s -; CHECK-NEXT: and v28.16b, v28.16b, v2.16b -; CHECK-NEXT: fcmeq v22.4s, v4.4s, v4.4s -; CHECK-NEXT: orr v4.4s, #64, lsl #16 -; CHECK-NEXT: and v2.16b, v29.16b, v2.16b -; CHECK-NEXT: fcmeq v29.4s, v5.4s, v5.4s -; CHECK-NEXT: orr v5.4s, #64, lsl #16 -; CHECK-NEXT: add v23.4s, v26.4s, v23.4s -; CHECK-NEXT: fcmeq v26.4s, v18.4s, v18.4s -; CHECK-NEXT: add v27.4s, v27.4s, v30.4s -; CHECK-NEXT: fcmeq v30.4s, v24.4s, v24.4s +; CHECK-NEXT: and v23.16b, v25.16b, v16.16b +; CHECK-NEXT: add v25.4s, v4.4s, v2.4s +; CHECK-NEXT: add v7.4s, v24.4s, v7.4s +; CHECK-NEXT: ushr v24.4s, v18.4s, #16 +; CHECK-NEXT: add v30.4s, v18.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v5.16b, v20.16b +; CHECK-NEXT: ushr v28.4s, v22.4s, #16 +; CHECK-NEXT: add v31.4s, v22.4s, v2.4s +; CHECK-NEXT: add v23.4s, v23.4s, v25.4s +; CHECK-NEXT: and v25.16b, v26.16b, v16.16b +; CHECK-NEXT: add v26.4s, v6.4s, v2.4s +; CHECK-NEXT: ushr v29.4s, v17.4s, #16 +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b +; CHECK-NEXT: add v2.4s, v17.4s, v2.4s +; CHECK-NEXT: and v28.16b, v28.16b, v16.16b +; CHECK-NEXT: bit v3.16b, v7.16b, v27.16b +; CHECK-NEXT: bit v1.16b, v19.16b, v21.16b +; CHECK-NEXT: add v25.4s, v25.4s, v26.4s +; CHECK-NEXT: fcmeq v26.4s, v6.4s, v6.4s +; CHECK-NEXT: orr v6.4s, #64, lsl #16 +; CHECK-NEXT: and v16.16b, v29.16b, v16.16b +; CHECK-NEXT: add v24.4s, v24.4s, v30.4s +; CHECK-NEXT: fcmeq v30.4s, v18.4s, v18.4s ; CHECK-NEXT: add v28.4s, v28.4s, v31.4s -; CHECK-NEXT: fcmeq v31.4s, v21.4s, v21.4s -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmeq v3.4s, v19.4s, v19.4s +; CHECK-NEXT: fcmeq v31.4s, v22.4s, v22.4s +; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s +; CHECK-NEXT: orr v4.4s, #64, lsl #16 ; CHECK-NEXT: orr v18.4s, #64, lsl #16 -; CHECK-NEXT: orr v24.4s, #64, lsl #16 -; CHECK-NEXT: orr v21.4s, #64, lsl #16 -; CHECK-NEXT: orr v19.4s, #64, lsl #16 -; CHECK-NEXT: bit v1.16b, v7.16b, v6.16b -; CHECK-NEXT: bit v4.16b, v16.16b, v22.16b -; CHECK-NEXT: mov v6.16b, v26.16b -; CHECK-NEXT: mov v7.16b, v30.16b -; CHECK-NEXT: mov v16.16b, v31.16b -; CHECK-NEXT: bit v0.16b, v20.16b, v25.16b -; CHECK-NEXT: bit v5.16b, v17.16b, v29.16b -; CHECK-NEXT: bsl v3.16b, v2.16b, v19.16b -; CHECK-NEXT: bsl v6.16b, v23.16b, v18.16b -; CHECK-NEXT: bsl v7.16b, v27.16b, v24.16b -; CHECK-NEXT: bsl v16.16b, v28.16b, v21.16b +; CHECK-NEXT: orr v22.4s, #64, lsl #16 +; CHECK-NEXT: mov v5.16b, v26.16b +; CHECK-NEXT: add v2.4s, v16.4s, v2.4s +; CHECK-NEXT: fcmeq v16.4s, v17.4s, v17.4s +; CHECK-NEXT: orr v17.4s, #64, lsl #16 ; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h -; CHECK-NEXT: uzp2 v1.8h, v5.8h, v4.8h -; CHECK-NEXT: uzp2 v2.8h, v7.8h, v6.8h -; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h +; CHECK-NEXT: mov v7.16b, v31.16b +; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b +; CHECK-NEXT: bsl v5.16b, v25.16b, v6.16b +; CHECK-NEXT: mov v6.16b, v30.16b +; CHECK-NEXT: bsl v16.16b, v2.16b, v17.16b +; CHECK-NEXT: bsl v7.16b, v28.16b, v22.16b +; CHECK-NEXT: bsl v6.16b, v24.16b, v18.16b +; CHECK-NEXT: uzp2 v1.8h, v4.8h, v3.8h +; CHECK-NEXT: uzp2 v3.8h, v16.8h, v7.8h +; CHECK-NEXT: uzp2 v2.8h, v6.8h, v5.8h ; CHECK-NEXT: ret entry: %c = sitofp <32 x i64> %a to <32 x bfloat> @@ -6944,107 +6944,107 @@ entry: define <32 x bfloat> @utofp_v32i64_v32bf16(<32 x i64> %a) { ; CHECK-LABEL: utofp_v32i64_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v16.2d, v2.2d -; CHECK-NEXT: ucvtf v17.2d, v0.2d -; CHECK-NEXT: ucvtf v18.2d, v3.2d -; CHECK-NEXT: ucvtf v19.2d, v6.2d -; CHECK-NEXT: ldp q24, q23, [sp, #96] -; CHECK-NEXT: ucvtf v21.2d, v1.2d -; CHECK-NEXT: ucvtf v22.2d, v4.2d +; CHECK-NEXT: ucvtf v17.2d, v2.2d +; CHECK-NEXT: ucvtf v18.2d, v0.2d +; CHECK-NEXT: ucvtf v19.2d, v3.2d +; CHECK-NEXT: ucvtf v3.2d, v6.2d +; CHECK-NEXT: ldp q21, q20, [sp, #32] +; CHECK-NEXT: ucvtf v4.2d, v4.2d ; CHECK-NEXT: ucvtf v6.2d, v7.2d -; CHECK-NEXT: ucvtf v7.2d, v5.2d -; CHECK-NEXT: movi v3.4s, #127, msl #8 -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: fcvtn v0.2s, v16.2d -; CHECK-NEXT: ldp q20, q16, [sp, #32] -; CHECK-NEXT: fcvtn v1.2s, v17.2d -; CHECK-NEXT: ldp q5, q17, [sp] -; CHECK-NEXT: fcvtn v4.2s, v19.2d -; CHECK-NEXT: ucvtf v23.2d, v23.2d +; CHECK-NEXT: ucvtf v5.2d, v5.2d +; CHECK-NEXT: ldp q24, q23, [sp, #64] +; CHECK-NEXT: movi v16.4s, #1 +; CHECK-NEXT: fcvtn v0.2s, v17.2d +; CHECK-NEXT: ucvtf v17.2d, v1.2d +; CHECK-NEXT: fcvtn v1.2s, v18.2d +; CHECK-NEXT: fcvtn v3.2s, v3.2d +; CHECK-NEXT: ldp q18, q7, [sp] +; CHECK-NEXT: ucvtf v21.2d, v21.2d +; CHECK-NEXT: fcvtn v4.2s, v4.2d +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: ucvtf v20.2d, v20.2d -; CHECK-NEXT: ucvtf v16.2d, v16.2d -; CHECK-NEXT: fcvtn2 v0.4s, v18.2d -; CHECK-NEXT: ldp q19, q18, [sp, #64] -; CHECK-NEXT: ucvtf v25.2d, v5.2d -; CHECK-NEXT: fcvtn v5.2s, v22.2d -; CHECK-NEXT: fcvtn2 v1.4s, v21.2d -; CHECK-NEXT: ucvtf v21.2d, v24.2d -; CHECK-NEXT: ucvtf v17.2d, v17.2d -; CHECK-NEXT: fcvtn2 v4.4s, v6.2d +; CHECK-NEXT: fcvtn2 v0.4s, v19.2d +; CHECK-NEXT: ldp q22, q19, [sp, #96] +; CHECK-NEXT: fcvtn2 v1.4s, v17.2d +; CHECK-NEXT: fcvtn2 v3.4s, v6.2d +; CHECK-NEXT: ucvtf v18.2d, v18.2d +; CHECK-NEXT: ucvtf v17.2d, v24.2d +; CHECK-NEXT: fcvtn v6.2s, v21.2d +; CHECK-NEXT: fcvtn2 v4.4s, v5.2d +; CHECK-NEXT: ucvtf v22.2d, v22.2d +; CHECK-NEXT: ucvtf v21.2d, v23.2d +; CHECK-NEXT: ucvtf v7.2d, v7.2d +; CHECK-NEXT: ushr v24.4s, v0.4s, #16 +; CHECK-NEXT: add v5.4s, v0.4s, v2.4s ; CHECK-NEXT: ucvtf v19.2d, v19.2d -; CHECK-NEXT: ucvtf v6.2d, v18.2d -; CHECK-NEXT: fcvtn v18.2s, v20.2d -; CHECK-NEXT: ushr v22.4s, v0.4s, #16 -; CHECK-NEXT: add v20.4s, v0.4s, v3.4s -; CHECK-NEXT: fcvtn2 v5.4s, v7.2d -; CHECK-NEXT: fcvtn v24.2s, v25.2d -; CHECK-NEXT: ushr v7.4s, v1.4s, #16 -; CHECK-NEXT: fcvtn v21.2s, v21.2d -; CHECK-NEXT: add v26.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v27.4s, v4.4s, #16 -; CHECK-NEXT: fcvtn v19.2s, v19.2d -; CHECK-NEXT: fcvtn2 v18.4s, v16.2d -; CHECK-NEXT: and v22.16b, v22.16b, v2.16b -; CHECK-NEXT: and v7.16b, v7.16b, v2.16b -; CHECK-NEXT: fcmeq v25.4s, v0.4s, v0.4s +; CHECK-NEXT: ushr v23.4s, v1.4s, #16 +; CHECK-NEXT: ushr v25.4s, v3.4s, #16 +; CHECK-NEXT: fcvtn v18.2s, v18.2d +; CHECK-NEXT: fcvtn2 v6.4s, v20.2d +; CHECK-NEXT: add v26.4s, v1.4s, v2.4s +; CHECK-NEXT: fcvtn v17.2s, v17.2d +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b +; CHECK-NEXT: fcvtn v22.2s, v22.2d +; CHECK-NEXT: fcmeq v20.4s, v0.4s, v0.4s +; CHECK-NEXT: and v23.16b, v23.16b, v16.16b ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: ushr v28.4s, v5.4s, #16 -; CHECK-NEXT: fcvtn2 v24.4s, v17.2d -; CHECK-NEXT: add v17.4s, v5.4s, v3.4s -; CHECK-NEXT: fcvtn2 v21.4s, v23.2d -; CHECK-NEXT: and v16.16b, v27.16b, v2.16b -; CHECK-NEXT: add v20.4s, v22.4s, v20.4s -; CHECK-NEXT: fcvtn2 v19.4s, v6.2d -; CHECK-NEXT: add v7.4s, v7.4s, v26.4s -; CHECK-NEXT: ushr v26.4s, v18.4s, #16 -; CHECK-NEXT: and v23.16b, v28.16b, v2.16b -; CHECK-NEXT: add v22.4s, v4.4s, v3.4s -; CHECK-NEXT: fcmeq v6.4s, v1.4s, v1.4s -; CHECK-NEXT: ushr v27.4s, v24.4s, #16 -; CHECK-NEXT: add v30.4s, v24.4s, v3.4s +; CHECK-NEXT: fcmeq v27.4s, v3.4s, v3.4s +; CHECK-NEXT: fcvtn2 v18.4s, v7.2d +; CHECK-NEXT: add v7.4s, v3.4s, v2.4s +; CHECK-NEXT: orr v3.4s, #64, lsl #16 +; CHECK-NEXT: add v5.4s, v24.4s, v5.4s +; CHECK-NEXT: and v24.16b, v25.16b, v16.16b +; CHECK-NEXT: ushr v25.4s, v4.4s, #16 +; CHECK-NEXT: fcvtn2 v22.4s, v19.2d +; CHECK-NEXT: add v19.4s, v23.4s, v26.4s +; CHECK-NEXT: ushr v26.4s, v6.4s, #16 +; CHECK-NEXT: fcvtn2 v17.4s, v21.2d +; CHECK-NEXT: fcmeq v21.4s, v1.4s, v1.4s ; CHECK-NEXT: orr v1.4s, #64, lsl #16 -; CHECK-NEXT: ushr v28.4s, v21.4s, #16 -; CHECK-NEXT: add v31.4s, v21.4s, v3.4s -; CHECK-NEXT: and v26.16b, v26.16b, v2.16b -; CHECK-NEXT: add v17.4s, v23.4s, v17.4s -; CHECK-NEXT: add v23.4s, v18.4s, v3.4s -; CHECK-NEXT: ushr v29.4s, v19.4s, #16 -; CHECK-NEXT: and v27.16b, v27.16b, v2.16b -; CHECK-NEXT: add v3.4s, v19.4s, v3.4s -; CHECK-NEXT: add v16.4s, v16.4s, v22.4s -; CHECK-NEXT: and v28.16b, v28.16b, v2.16b -; CHECK-NEXT: fcmeq v22.4s, v4.4s, v4.4s -; CHECK-NEXT: orr v4.4s, #64, lsl #16 -; CHECK-NEXT: and v2.16b, v29.16b, v2.16b -; CHECK-NEXT: fcmeq v29.4s, v5.4s, v5.4s -; CHECK-NEXT: orr v5.4s, #64, lsl #16 -; CHECK-NEXT: add v23.4s, v26.4s, v23.4s -; CHECK-NEXT: fcmeq v26.4s, v18.4s, v18.4s -; CHECK-NEXT: add v27.4s, v27.4s, v30.4s -; CHECK-NEXT: fcmeq v30.4s, v24.4s, v24.4s +; CHECK-NEXT: and v23.16b, v25.16b, v16.16b +; CHECK-NEXT: add v25.4s, v4.4s, v2.4s +; CHECK-NEXT: add v7.4s, v24.4s, v7.4s +; CHECK-NEXT: ushr v24.4s, v18.4s, #16 +; CHECK-NEXT: add v30.4s, v18.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v5.16b, v20.16b +; CHECK-NEXT: ushr v28.4s, v22.4s, #16 +; CHECK-NEXT: add v31.4s, v22.4s, v2.4s +; CHECK-NEXT: add v23.4s, v23.4s, v25.4s +; CHECK-NEXT: and v25.16b, v26.16b, v16.16b +; CHECK-NEXT: add v26.4s, v6.4s, v2.4s +; CHECK-NEXT: ushr v29.4s, v17.4s, #16 +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b +; CHECK-NEXT: add v2.4s, v17.4s, v2.4s +; CHECK-NEXT: and v28.16b, v28.16b, v16.16b +; CHECK-NEXT: bit v3.16b, v7.16b, v27.16b +; CHECK-NEXT: bit v1.16b, v19.16b, v21.16b +; CHECK-NEXT: add v25.4s, v25.4s, v26.4s +; CHECK-NEXT: fcmeq v26.4s, v6.4s, v6.4s +; CHECK-NEXT: orr v6.4s, #64, lsl #16 +; CHECK-NEXT: and v16.16b, v29.16b, v16.16b +; CHECK-NEXT: add v24.4s, v24.4s, v30.4s +; CHECK-NEXT: fcmeq v30.4s, v18.4s, v18.4s ; CHECK-NEXT: add v28.4s, v28.4s, v31.4s -; CHECK-NEXT: fcmeq v31.4s, v21.4s, v21.4s -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmeq v3.4s, v19.4s, v19.4s +; CHECK-NEXT: fcmeq v31.4s, v22.4s, v22.4s +; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s +; CHECK-NEXT: orr v4.4s, #64, lsl #16 ; CHECK-NEXT: orr v18.4s, #64, lsl #16 -; CHECK-NEXT: orr v24.4s, #64, lsl #16 -; CHECK-NEXT: orr v21.4s, #64, lsl #16 -; CHECK-NEXT: orr v19.4s, #64, lsl #16 -; CHECK-NEXT: bit v1.16b, v7.16b, v6.16b -; CHECK-NEXT: bit v4.16b, v16.16b, v22.16b -; CHECK-NEXT: mov v6.16b, v26.16b -; CHECK-NEXT: mov v7.16b, v30.16b -; CHECK-NEXT: mov v16.16b, v31.16b -; CHECK-NEXT: bit v0.16b, v20.16b, v25.16b -; CHECK-NEXT: bit v5.16b, v17.16b, v29.16b -; CHECK-NEXT: bsl v3.16b, v2.16b, v19.16b -; CHECK-NEXT: bsl v6.16b, v23.16b, v18.16b -; CHECK-NEXT: bsl v7.16b, v27.16b, v24.16b -; CHECK-NEXT: bsl v16.16b, v28.16b, v21.16b +; CHECK-NEXT: orr v22.4s, #64, lsl #16 +; CHECK-NEXT: mov v5.16b, v26.16b +; CHECK-NEXT: add v2.4s, v16.4s, v2.4s +; CHECK-NEXT: fcmeq v16.4s, v17.4s, v17.4s +; CHECK-NEXT: orr v17.4s, #64, lsl #16 ; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h -; CHECK-NEXT: uzp2 v1.8h, v5.8h, v4.8h -; CHECK-NEXT: uzp2 v2.8h, v7.8h, v6.8h -; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h +; CHECK-NEXT: mov v7.16b, v31.16b +; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b +; CHECK-NEXT: bsl v5.16b, v25.16b, v6.16b +; CHECK-NEXT: mov v6.16b, v30.16b +; CHECK-NEXT: bsl v16.16b, v2.16b, v17.16b +; CHECK-NEXT: bsl v7.16b, v28.16b, v22.16b +; CHECK-NEXT: bsl v6.16b, v24.16b, v18.16b +; CHECK-NEXT: uzp2 v1.8h, v4.8h, v3.8h +; CHECK-NEXT: uzp2 v3.8h, v16.8h, v7.8h +; CHECK-NEXT: uzp2 v2.8h, v6.8h, v5.8h ; CHECK-NEXT: ret entry: %c = uitofp <32 x i64> %a to <32 x bfloat> @@ -7059,9 +7059,9 @@ define <2 x bfloat> @stofp_v2i32_v2bf16(<2 x i32> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <2 x i32> %a to <2 x bfloat> @@ -7076,9 +7076,9 @@ define <2 x bfloat> @utofp_v2i32_v2bf16(<2 x i32> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <2 x i32> %a to <2 x bfloat> @@ -7092,9 +7092,9 @@ define <3 x bfloat> @stofp_v3i32_v3bf16(<3 x i32> %a) { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <3 x i32> %a to <3 x bfloat> @@ -7108,9 +7108,9 @@ define <3 x bfloat> @utofp_v3i32_v3bf16(<3 x i32> %a) { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <3 x i32> %a to <3 x bfloat> @@ -7124,9 +7124,9 @@ define <4 x bfloat> @stofp_v4i32_v4bf16(<4 x i32> %a) { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <4 x i32> %a to <4 x bfloat> @@ -7140,9 +7140,9 @@ define <4 x bfloat> @utofp_v4i32_v4bf16(<4 x i32> %a) { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <4 x i32> %a to <4 x bfloat> @@ -7155,15 +7155,15 @@ define <8 x bfloat> @stofp_v8i32_v8bf16(<8 x i32> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: scvtf v1.4s, v1.4s +; CHECK-NEXT: movi v5.4s, #127, msl #8 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 ; CHECK-NEXT: ushr v4.4s, v1.4s, #16 ; CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v2.16b, v4.16b, v2.16b ; CHECK-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v3.4s -; CHECK-NEXT: addhn2 v0.8h, v1.4s, v3.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-NEXT: addhn2 v0.8h, v1.4s, v5.4s ; CHECK-NEXT: ret entry: %c = sitofp <8 x i32> %a to <8 x bfloat> @@ -7176,15 +7176,15 @@ define <8 x bfloat> @utofp_v8i32_v8bf16(<8 x i32> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: ucvtf v1.4s, v1.4s +; CHECK-NEXT: movi v5.4s, #127, msl #8 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 ; CHECK-NEXT: ushr v4.4s, v1.4s, #16 ; CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v2.16b, v4.16b, v2.16b ; CHECK-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v3.4s -; CHECK-NEXT: addhn2 v0.8h, v1.4s, v3.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-NEXT: addhn2 v0.8h, v1.4s, v5.4s ; CHECK-NEXT: ret entry: %c = uitofp <8 x i32> %a to <8 x bfloat> @@ -7194,28 +7194,28 @@ entry: define <16 x bfloat> @stofp_v16i32_v16bf16(<16 x i32> %a) { ; CHECK-LABEL: stofp_v16i32_v16bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: scvtf v2.4s, v2.4s -; CHECK-NEXT: movi v4.4s, #1 -; CHECK-NEXT: scvtf v1.4s, v1.4s +; CHECK-NEXT: scvtf v0.4s, v0.4s +; CHECK-NEXT: scvtf v4.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: scvtf v3.4s, v3.4s +; CHECK-NEXT: movi v17.4s, #127, msl #8 ; CHECK-NEXT: ushr v5.4s, v0.4s, #16 ; CHECK-NEXT: ushr v6.4s, v2.4s, #16 -; CHECK-NEXT: ushr v7.4s, v1.4s, #16 +; CHECK-NEXT: ushr v7.4s, v4.4s, #16 ; CHECK-NEXT: ushr v16.4s, v3.4s, #16 -; CHECK-NEXT: and v5.16b, v5.16b, v4.16b -; CHECK-NEXT: and v6.16b, v6.16b, v4.16b +; CHECK-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-NEXT: and v6.16b, v6.16b, v1.16b ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-NEXT: movi v6.4s, #127, msl #8 -; CHECK-NEXT: and v5.16b, v7.16b, v4.16b -; CHECK-NEXT: and v4.16b, v16.16b, v4.16b -; CHECK-NEXT: add v5.4s, v5.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v6.4s -; CHECK-NEXT: add v3.4s, v4.4s, v3.4s -; CHECK-NEXT: addhn v1.4h, v2.4s, v6.4s -; CHECK-NEXT: addhn2 v0.8h, v5.4s, v6.4s -; CHECK-NEXT: addhn2 v1.8h, v3.4s, v6.4s +; CHECK-NEXT: and v5.16b, v7.16b, v1.16b +; CHECK-NEXT: and v6.16b, v16.16b, v1.16b +; CHECK-NEXT: addhn v0.4h, v0.4s, v17.4s +; CHECK-NEXT: addhn v1.4h, v2.4s, v17.4s +; CHECK-NEXT: add v2.4s, v5.4s, v4.4s +; CHECK-NEXT: add v3.4s, v6.4s, v3.4s +; CHECK-NEXT: addhn2 v0.8h, v2.4s, v17.4s +; CHECK-NEXT: addhn2 v1.8h, v3.4s, v17.4s ; CHECK-NEXT: ret entry: %c = sitofp <16 x i32> %a to <16 x bfloat> @@ -7225,28 +7225,28 @@ entry: define <16 x bfloat> @utofp_v16i32_v16bf16(<16 x i32> %a) { ; CHECK-LABEL: utofp_v16i32_v16bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ucvtf v2.4s, v2.4s -; CHECK-NEXT: movi v4.4s, #1 -; CHECK-NEXT: ucvtf v1.4s, v1.4s +; CHECK-NEXT: ucvtf v0.4s, v0.4s +; CHECK-NEXT: ucvtf v4.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ucvtf v3.4s, v3.4s +; CHECK-NEXT: movi v17.4s, #127, msl #8 ; CHECK-NEXT: ushr v5.4s, v0.4s, #16 ; CHECK-NEXT: ushr v6.4s, v2.4s, #16 -; CHECK-NEXT: ushr v7.4s, v1.4s, #16 +; CHECK-NEXT: ushr v7.4s, v4.4s, #16 ; CHECK-NEXT: ushr v16.4s, v3.4s, #16 -; CHECK-NEXT: and v5.16b, v5.16b, v4.16b -; CHECK-NEXT: and v6.16b, v6.16b, v4.16b +; CHECK-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-NEXT: and v6.16b, v6.16b, v1.16b ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-NEXT: movi v6.4s, #127, msl #8 -; CHECK-NEXT: and v5.16b, v7.16b, v4.16b -; CHECK-NEXT: and v4.16b, v16.16b, v4.16b -; CHECK-NEXT: add v5.4s, v5.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v6.4s -; CHECK-NEXT: add v3.4s, v4.4s, v3.4s -; CHECK-NEXT: addhn v1.4h, v2.4s, v6.4s -; CHECK-NEXT: addhn2 v0.8h, v5.4s, v6.4s -; CHECK-NEXT: addhn2 v1.8h, v3.4s, v6.4s +; CHECK-NEXT: and v5.16b, v7.16b, v1.16b +; CHECK-NEXT: and v6.16b, v16.16b, v1.16b +; CHECK-NEXT: addhn v0.4h, v0.4s, v17.4s +; CHECK-NEXT: addhn v1.4h, v2.4s, v17.4s +; CHECK-NEXT: add v2.4s, v5.4s, v4.4s +; CHECK-NEXT: add v3.4s, v6.4s, v3.4s +; CHECK-NEXT: addhn2 v0.8h, v2.4s, v17.4s +; CHECK-NEXT: addhn2 v1.8h, v3.4s, v17.4s ; CHECK-NEXT: ret entry: %c = uitofp <16 x i32> %a to <16 x bfloat> @@ -7262,42 +7262,42 @@ define <32 x bfloat> @stofp_v32i32_v32bf16(<32 x i32> %a) { ; CHECK-NEXT: scvtf v6.4s, v6.4s ; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: scvtf v1.4s, v1.4s -; CHECK-NEXT: scvtf v3.4s, v3.4s +; CHECK-NEXT: scvtf v17.4s, v3.4s ; CHECK-NEXT: scvtf v5.4s, v5.4s ; CHECK-NEXT: scvtf v7.4s, v7.4s -; CHECK-NEXT: ushr v17.4s, v0.4s, #16 +; CHECK-NEXT: movi v21.4s, #127, msl #8 +; CHECK-NEXT: ushr v3.4s, v0.4s, #16 ; CHECK-NEXT: ushr v18.4s, v2.4s, #16 ; CHECK-NEXT: ushr v19.4s, v4.4s, #16 ; CHECK-NEXT: ushr v20.4s, v6.4s, #16 -; CHECK-NEXT: ushr v21.4s, v1.4s, #16 -; CHECK-NEXT: ushr v22.4s, v3.4s, #16 -; CHECK-NEXT: ushr v23.4s, v5.4s, #16 -; CHECK-NEXT: and v17.16b, v17.16b, v16.16b +; CHECK-NEXT: ushr v22.4s, v1.4s, #16 +; CHECK-NEXT: ushr v23.4s, v17.4s, #16 +; CHECK-NEXT: ushr v24.4s, v5.4s, #16 +; CHECK-NEXT: ushr v25.4s, v7.4s, #16 +; CHECK-NEXT: and v3.16b, v3.16b, v16.16b ; CHECK-NEXT: and v18.16b, v18.16b, v16.16b ; CHECK-NEXT: and v19.16b, v19.16b, v16.16b ; CHECK-NEXT: and v20.16b, v20.16b, v16.16b -; CHECK-NEXT: and v21.16b, v21.16b, v16.16b -; CHECK-NEXT: and v22.16b, v22.16b, v16.16b -; CHECK-NEXT: add v0.4s, v17.4s, v0.4s -; CHECK-NEXT: ushr v17.4s, v7.4s, #16 +; CHECK-NEXT: add v0.4s, v3.4s, v0.4s +; CHECK-NEXT: and v3.16b, v22.16b, v16.16b ; CHECK-NEXT: add v2.4s, v18.4s, v2.4s -; CHECK-NEXT: movi v18.4s, #127, msl #8 ; CHECK-NEXT: add v4.4s, v19.4s, v4.4s ; CHECK-NEXT: add v6.4s, v20.4s, v6.4s -; CHECK-NEXT: and v19.16b, v23.16b, v16.16b -; CHECK-NEXT: add v20.4s, v22.4s, v3.4s -; CHECK-NEXT: and v16.16b, v17.16b, v16.16b -; CHECK-NEXT: add v17.4s, v21.4s, v1.4s +; CHECK-NEXT: and v18.16b, v23.16b, v16.16b +; CHECK-NEXT: and v19.16b, v24.16b, v16.16b +; CHECK-NEXT: and v16.16b, v25.16b, v16.16b +; CHECK-NEXT: add v20.4s, v3.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v21.4s +; CHECK-NEXT: addhn v1.4h, v2.4s, v21.4s +; CHECK-NEXT: addhn v2.4h, v4.4s, v21.4s +; CHECK-NEXT: addhn v3.4h, v6.4s, v21.4s +; CHECK-NEXT: add v4.4s, v18.4s, v17.4s ; CHECK-NEXT: add v5.4s, v19.4s, v5.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v18.4s -; CHECK-NEXT: addhn v1.4h, v2.4s, v18.4s -; CHECK-NEXT: addhn v2.4h, v4.4s, v18.4s -; CHECK-NEXT: add v4.4s, v16.4s, v7.4s -; CHECK-NEXT: addhn v3.4h, v6.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v17.4s, v18.4s -; CHECK-NEXT: addhn2 v1.8h, v20.4s, v18.4s -; CHECK-NEXT: addhn2 v2.8h, v5.4s, v18.4s -; CHECK-NEXT: addhn2 v3.8h, v4.4s, v18.4s +; CHECK-NEXT: add v6.4s, v16.4s, v7.4s +; CHECK-NEXT: addhn2 v0.8h, v20.4s, v21.4s +; CHECK-NEXT: addhn2 v1.8h, v4.4s, v21.4s +; CHECK-NEXT: addhn2 v2.8h, v5.4s, v21.4s +; CHECK-NEXT: addhn2 v3.8h, v6.4s, v21.4s ; CHECK-NEXT: ret entry: %c = sitofp <32 x i32> %a to <32 x bfloat> @@ -7313,42 +7313,42 @@ define <32 x bfloat> @utofp_v32i32_v32bf16(<32 x i32> %a) { ; CHECK-NEXT: ucvtf v6.4s, v6.4s ; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: ucvtf v1.4s, v1.4s -; CHECK-NEXT: ucvtf v3.4s, v3.4s +; CHECK-NEXT: ucvtf v17.4s, v3.4s ; CHECK-NEXT: ucvtf v5.4s, v5.4s ; CHECK-NEXT: ucvtf v7.4s, v7.4s -; CHECK-NEXT: ushr v17.4s, v0.4s, #16 +; CHECK-NEXT: movi v21.4s, #127, msl #8 +; CHECK-NEXT: ushr v3.4s, v0.4s, #16 ; CHECK-NEXT: ushr v18.4s, v2.4s, #16 ; CHECK-NEXT: ushr v19.4s, v4.4s, #16 ; CHECK-NEXT: ushr v20.4s, v6.4s, #16 -; CHECK-NEXT: ushr v21.4s, v1.4s, #16 -; CHECK-NEXT: ushr v22.4s, v3.4s, #16 -; CHECK-NEXT: ushr v23.4s, v5.4s, #16 -; CHECK-NEXT: and v17.16b, v17.16b, v16.16b +; CHECK-NEXT: ushr v22.4s, v1.4s, #16 +; CHECK-NEXT: ushr v23.4s, v17.4s, #16 +; CHECK-NEXT: ushr v24.4s, v5.4s, #16 +; CHECK-NEXT: ushr v25.4s, v7.4s, #16 +; CHECK-NEXT: and v3.16b, v3.16b, v16.16b ; CHECK-NEXT: and v18.16b, v18.16b, v16.16b ; CHECK-NEXT: and v19.16b, v19.16b, v16.16b ; CHECK-NEXT: and v20.16b, v20.16b, v16.16b -; CHECK-NEXT: and v21.16b, v21.16b, v16.16b -; CHECK-NEXT: and v22.16b, v22.16b, v16.16b -; CHECK-NEXT: add v0.4s, v17.4s, v0.4s -; CHECK-NEXT: ushr v17.4s, v7.4s, #16 +; CHECK-NEXT: add v0.4s, v3.4s, v0.4s +; CHECK-NEXT: and v3.16b, v22.16b, v16.16b ; CHECK-NEXT: add v2.4s, v18.4s, v2.4s -; CHECK-NEXT: movi v18.4s, #127, msl #8 ; CHECK-NEXT: add v4.4s, v19.4s, v4.4s ; CHECK-NEXT: add v6.4s, v20.4s, v6.4s -; CHECK-NEXT: and v19.16b, v23.16b, v16.16b -; CHECK-NEXT: add v20.4s, v22.4s, v3.4s -; CHECK-NEXT: and v16.16b, v17.16b, v16.16b -; CHECK-NEXT: add v17.4s, v21.4s, v1.4s +; CHECK-NEXT: and v18.16b, v23.16b, v16.16b +; CHECK-NEXT: and v19.16b, v24.16b, v16.16b +; CHECK-NEXT: and v16.16b, v25.16b, v16.16b +; CHECK-NEXT: add v20.4s, v3.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v21.4s +; CHECK-NEXT: addhn v1.4h, v2.4s, v21.4s +; CHECK-NEXT: addhn v2.4h, v4.4s, v21.4s +; CHECK-NEXT: addhn v3.4h, v6.4s, v21.4s +; CHECK-NEXT: add v4.4s, v18.4s, v17.4s ; CHECK-NEXT: add v5.4s, v19.4s, v5.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v18.4s -; CHECK-NEXT: addhn v1.4h, v2.4s, v18.4s -; CHECK-NEXT: addhn v2.4h, v4.4s, v18.4s -; CHECK-NEXT: add v4.4s, v16.4s, v7.4s -; CHECK-NEXT: addhn v3.4h, v6.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v17.4s, v18.4s -; CHECK-NEXT: addhn2 v1.8h, v20.4s, v18.4s -; CHECK-NEXT: addhn2 v2.8h, v5.4s, v18.4s -; CHECK-NEXT: addhn2 v3.8h, v4.4s, v18.4s +; CHECK-NEXT: add v6.4s, v16.4s, v7.4s +; CHECK-NEXT: addhn2 v0.8h, v20.4s, v21.4s +; CHECK-NEXT: addhn2 v1.8h, v4.4s, v21.4s +; CHECK-NEXT: addhn2 v2.8h, v5.4s, v21.4s +; CHECK-NEXT: addhn2 v3.8h, v6.4s, v21.4s ; CHECK-NEXT: ret entry: %c = uitofp <32 x i32> %a to <32 x bfloat> @@ -7364,9 +7364,9 @@ define <2 x bfloat> @stofp_v2i16_v2bf16(<2 x i16> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <2 x i16> %a to <2 x bfloat> @@ -7382,9 +7382,9 @@ define <2 x bfloat> @utofp_v2i16_v2bf16(<2 x i16> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <2 x i16> %a to <2 x bfloat> @@ -7399,9 +7399,9 @@ define <3 x bfloat> @stofp_v3i16_v3bf16(<3 x i16> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <3 x i16> %a to <3 x bfloat> @@ -7416,9 +7416,9 @@ define <3 x bfloat> @utofp_v3i16_v3bf16(<3 x i16> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <3 x i16> %a to <3 x bfloat> @@ -7433,9 +7433,9 @@ define <4 x bfloat> @stofp_v4i16_v4bf16(<4 x i16> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <4 x i16> %a to <4 x bfloat> @@ -7450,9 +7450,9 @@ define <4 x bfloat> @utofp_v4i16_v4bf16(<4 x i16> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <4 x i16> %a to <4 x bfloat> @@ -7513,27 +7513,27 @@ define <16 x bfloat> @stofp_v16i16_v16bf16(<16 x i16> %a) { ; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0 ; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v7.4s, #127, msl #8 ; CHECK-NEXT: scvtf v3.4s, v3.4s ; CHECK-NEXT: scvtf v4.4s, v4.4s -; CHECK-NEXT: scvtf v6.4s, v0.4s -; CHECK-NEXT: scvtf v7.4s, v1.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: ushr v5.4s, v3.4s, #16 -; CHECK-NEXT: ushr v0.4s, v4.4s, #16 -; CHECK-NEXT: ushr v16.4s, v6.4s, #16 -; CHECK-NEXT: ushr v17.4s, v7.4s, #16 -; CHECK-NEXT: and v5.16b, v5.16b, v2.16b +; CHECK-NEXT: scvtf v5.4s, v0.4s +; CHECK-NEXT: scvtf v6.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v3.4s, #16 +; CHECK-NEXT: ushr v1.4s, v4.4s, #16 +; CHECK-NEXT: ushr v16.4s, v5.4s, #16 +; CHECK-NEXT: ushr v17.4s, v6.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: and v16.16b, v16.16b, v2.16b ; CHECK-NEXT: and v2.16b, v17.16b, v2.16b -; CHECK-NEXT: add v5.4s, v5.4s, v1.4s -; CHECK-NEXT: add v18.4s, v0.4s, v1.4s -; CHECK-NEXT: add v2.4s, v2.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v3.4s, v5.4s -; CHECK-NEXT: add v3.4s, v16.4s, v1.4s -; CHECK-NEXT: addhn v1.4h, v4.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v6.4s, v3.4s -; CHECK-NEXT: addhn2 v1.8h, v7.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v7.4s +; CHECK-NEXT: add v1.4s, v1.4s, v7.4s +; CHECK-NEXT: add v2.4s, v2.4s, v7.4s +; CHECK-NEXT: addhn v0.4h, v3.4s, v0.4s +; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s +; CHECK-NEXT: add v3.4s, v16.4s, v7.4s +; CHECK-NEXT: addhn2 v0.8h, v5.4s, v3.4s +; CHECK-NEXT: addhn2 v1.8h, v6.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <16 x i16> %a to <16 x bfloat> @@ -7548,27 +7548,27 @@ define <16 x bfloat> @utofp_v16i16_v16bf16(<16 x i16> %a) { ; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v7.4s, #127, msl #8 ; CHECK-NEXT: ucvtf v3.4s, v3.4s ; CHECK-NEXT: ucvtf v4.4s, v4.4s -; CHECK-NEXT: ucvtf v6.4s, v0.4s -; CHECK-NEXT: ucvtf v7.4s, v1.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: ushr v5.4s, v3.4s, #16 -; CHECK-NEXT: ushr v0.4s, v4.4s, #16 -; CHECK-NEXT: ushr v16.4s, v6.4s, #16 -; CHECK-NEXT: ushr v17.4s, v7.4s, #16 -; CHECK-NEXT: and v5.16b, v5.16b, v2.16b +; CHECK-NEXT: ucvtf v5.4s, v0.4s +; CHECK-NEXT: ucvtf v6.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v3.4s, #16 +; CHECK-NEXT: ushr v1.4s, v4.4s, #16 +; CHECK-NEXT: ushr v16.4s, v5.4s, #16 +; CHECK-NEXT: ushr v17.4s, v6.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: and v16.16b, v16.16b, v2.16b ; CHECK-NEXT: and v2.16b, v17.16b, v2.16b -; CHECK-NEXT: add v5.4s, v5.4s, v1.4s -; CHECK-NEXT: add v18.4s, v0.4s, v1.4s -; CHECK-NEXT: add v2.4s, v2.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v3.4s, v5.4s -; CHECK-NEXT: add v3.4s, v16.4s, v1.4s -; CHECK-NEXT: addhn v1.4h, v4.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v6.4s, v3.4s -; CHECK-NEXT: addhn2 v1.8h, v7.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v7.4s +; CHECK-NEXT: add v1.4s, v1.4s, v7.4s +; CHECK-NEXT: add v2.4s, v2.4s, v7.4s +; CHECK-NEXT: addhn v0.4h, v3.4s, v0.4s +; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s +; CHECK-NEXT: add v3.4s, v16.4s, v7.4s +; CHECK-NEXT: addhn2 v0.8h, v5.4s, v3.4s +; CHECK-NEXT: addhn2 v1.8h, v6.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <16 x i16> %a to <16 x bfloat> @@ -7578,56 +7578,56 @@ entry: define <32 x bfloat> @stofp_v32i16_v32bf16(<32 x i16> %a) { ; CHECK-LABEL: stofp_v32i16_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v5.4s, v1.4h, #0 -; CHECK-NEXT: sshll v4.4s, v0.4h, #0 +; CHECK-NEXT: sshll v4.4s, v1.4h, #0 +; CHECK-NEXT: sshll v5.4s, v0.4h, #0 ; CHECK-NEXT: sshll v6.4s, v2.4h, #0 ; CHECK-NEXT: sshll v7.4s, v3.4h, #0 ; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0 ; CHECK-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: scvtf v5.4s, v5.4s ; CHECK-NEXT: scvtf v4.4s, v4.4s ; CHECK-NEXT: scvtf v6.4s, v6.4s ; CHECK-NEXT: scvtf v7.4s, v7.4s -; CHECK-NEXT: scvtf v19.4s, v0.4s -; CHECK-NEXT: movi v18.4s, #127, msl #8 -; CHECK-NEXT: scvtf v20.4s, v1.4s -; CHECK-NEXT: scvtf v21.4s, v2.4s -; CHECK-NEXT: scvtf v22.4s, v3.4s +; CHECK-NEXT: scvtf v17.4s, v0.4s +; CHECK-NEXT: scvtf v18.4s, v1.4s +; CHECK-NEXT: scvtf v19.4s, v2.4s +; CHECK-NEXT: scvtf v20.4s, v3.4s +; CHECK-NEXT: movi v21.4s, #127, msl #8 ; CHECK-NEXT: ushr v0.4s, v5.4s, #16 -; CHECK-NEXT: ushr v17.4s, v4.4s, #16 -; CHECK-NEXT: ushr v1.4s, v6.4s, #16 -; CHECK-NEXT: ushr v2.4s, v7.4s, #16 -; CHECK-NEXT: ushr v23.4s, v20.4s, #16 -; CHECK-NEXT: ushr v25.4s, v22.4s, #16 +; CHECK-NEXT: ushr v1.4s, v4.4s, #16 +; CHECK-NEXT: ushr v2.4s, v6.4s, #16 +; CHECK-NEXT: ushr v3.4s, v7.4s, #16 +; CHECK-NEXT: ushr v22.4s, v17.4s, #16 +; CHECK-NEXT: ushr v23.4s, v18.4s, #16 +; CHECK-NEXT: ushr v24.4s, v19.4s, #16 +; CHECK-NEXT: ushr v25.4s, v20.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v16.16b -; CHECK-NEXT: and v3.16b, v17.16b, v16.16b ; CHECK-NEXT: and v1.16b, v1.16b, v16.16b ; CHECK-NEXT: and v2.16b, v2.16b, v16.16b -; CHECK-NEXT: ushr v17.4s, v19.4s, #16 +; CHECK-NEXT: and v3.16b, v3.16b, v16.16b +; CHECK-NEXT: and v22.16b, v22.16b, v16.16b ; CHECK-NEXT: and v23.16b, v23.16b, v16.16b -; CHECK-NEXT: add v24.4s, v0.4s, v18.4s -; CHECK-NEXT: ushr v0.4s, v21.4s, #16 -; CHECK-NEXT: add v3.4s, v3.4s, v18.4s -; CHECK-NEXT: add v26.4s, v1.4s, v18.4s -; CHECK-NEXT: add v27.4s, v2.4s, v18.4s -; CHECK-NEXT: and v17.16b, v17.16b, v16.16b -; CHECK-NEXT: and v28.16b, v0.16b, v16.16b +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b ; CHECK-NEXT: and v16.16b, v25.16b, v16.16b -; CHECK-NEXT: addhn v0.4h, v4.4s, v3.4s -; CHECK-NEXT: addhn v1.4h, v5.4s, v24.4s -; CHECK-NEXT: add v4.4s, v17.4s, v18.4s -; CHECK-NEXT: addhn v2.4h, v6.4s, v26.4s -; CHECK-NEXT: add v5.4s, v23.4s, v18.4s -; CHECK-NEXT: addhn v3.4h, v7.4s, v27.4s -; CHECK-NEXT: add v6.4s, v28.4s, v18.4s -; CHECK-NEXT: add v16.4s, v16.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v19.4s, v4.4s -; CHECK-NEXT: addhn2 v1.8h, v20.4s, v5.4s -; CHECK-NEXT: addhn2 v2.8h, v21.4s, v6.4s -; CHECK-NEXT: addhn2 v3.8h, v22.4s, v16.4s +; CHECK-NEXT: add v0.4s, v0.4s, v21.4s +; CHECK-NEXT: add v1.4s, v1.4s, v21.4s +; CHECK-NEXT: add v2.4s, v2.4s, v21.4s +; CHECK-NEXT: add v3.4s, v3.4s, v21.4s +; CHECK-NEXT: addhn v0.4h, v5.4s, v0.4s +; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s +; CHECK-NEXT: addhn v2.4h, v6.4s, v2.4s +; CHECK-NEXT: addhn v3.4h, v7.4s, v3.4s +; CHECK-NEXT: add v4.4s, v22.4s, v21.4s +; CHECK-NEXT: add v5.4s, v23.4s, v21.4s +; CHECK-NEXT: add v6.4s, v24.4s, v21.4s +; CHECK-NEXT: add v7.4s, v16.4s, v21.4s +; CHECK-NEXT: addhn2 v0.8h, v17.4s, v4.4s +; CHECK-NEXT: addhn2 v1.8h, v18.4s, v5.4s +; CHECK-NEXT: addhn2 v2.8h, v19.4s, v6.4s +; CHECK-NEXT: addhn2 v3.8h, v20.4s, v7.4s ; CHECK-NEXT: ret entry: %c = sitofp <32 x i16> %a to <32 x bfloat> @@ -7637,56 +7637,56 @@ entry: define <32 x bfloat> @utofp_v32i16_v32bf16(<32 x i16> %a) { ; CHECK-LABEL: utofp_v32i16_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 +; CHECK-NEXT: ushll v4.4s, v1.4h, #0 +; CHECK-NEXT: ushll v5.4s, v0.4h, #0 ; CHECK-NEXT: ushll v6.4s, v2.4h, #0 ; CHECK-NEXT: ushll v7.4s, v3.4h, #0 ; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 ; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: ucvtf v5.4s, v5.4s ; CHECK-NEXT: ucvtf v4.4s, v4.4s ; CHECK-NEXT: ucvtf v6.4s, v6.4s ; CHECK-NEXT: ucvtf v7.4s, v7.4s -; CHECK-NEXT: ucvtf v19.4s, v0.4s -; CHECK-NEXT: movi v18.4s, #127, msl #8 -; CHECK-NEXT: ucvtf v20.4s, v1.4s -; CHECK-NEXT: ucvtf v21.4s, v2.4s -; CHECK-NEXT: ucvtf v22.4s, v3.4s +; CHECK-NEXT: ucvtf v17.4s, v0.4s +; CHECK-NEXT: ucvtf v18.4s, v1.4s +; CHECK-NEXT: ucvtf v19.4s, v2.4s +; CHECK-NEXT: ucvtf v20.4s, v3.4s +; CHECK-NEXT: movi v21.4s, #127, msl #8 ; CHECK-NEXT: ushr v0.4s, v5.4s, #16 -; CHECK-NEXT: ushr v17.4s, v4.4s, #16 -; CHECK-NEXT: ushr v1.4s, v6.4s, #16 -; CHECK-NEXT: ushr v2.4s, v7.4s, #16 -; CHECK-NEXT: ushr v23.4s, v20.4s, #16 -; CHECK-NEXT: ushr v25.4s, v22.4s, #16 +; CHECK-NEXT: ushr v1.4s, v4.4s, #16 +; CHECK-NEXT: ushr v2.4s, v6.4s, #16 +; CHECK-NEXT: ushr v3.4s, v7.4s, #16 +; CHECK-NEXT: ushr v22.4s, v17.4s, #16 +; CHECK-NEXT: ushr v23.4s, v18.4s, #16 +; CHECK-NEXT: ushr v24.4s, v19.4s, #16 +; CHECK-NEXT: ushr v25.4s, v20.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v16.16b -; CHECK-NEXT: and v3.16b, v17.16b, v16.16b ; CHECK-NEXT: and v1.16b, v1.16b, v16.16b ; CHECK-NEXT: and v2.16b, v2.16b, v16.16b -; CHECK-NEXT: ushr v17.4s, v19.4s, #16 +; CHECK-NEXT: and v3.16b, v3.16b, v16.16b +; CHECK-NEXT: and v22.16b, v22.16b, v16.16b ; CHECK-NEXT: and v23.16b, v23.16b, v16.16b -; CHECK-NEXT: add v24.4s, v0.4s, v18.4s -; CHECK-NEXT: ushr v0.4s, v21.4s, #16 -; CHECK-NEXT: add v3.4s, v3.4s, v18.4s -; CHECK-NEXT: add v26.4s, v1.4s, v18.4s -; CHECK-NEXT: add v27.4s, v2.4s, v18.4s -; CHECK-NEXT: and v17.16b, v17.16b, v16.16b -; CHECK-NEXT: and v28.16b, v0.16b, v16.16b +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b ; CHECK-NEXT: and v16.16b, v25.16b, v16.16b -; CHECK-NEXT: addhn v0.4h, v4.4s, v3.4s -; CHECK-NEXT: addhn v1.4h, v5.4s, v24.4s -; CHECK-NEXT: add v4.4s, v17.4s, v18.4s -; CHECK-NEXT: addhn v2.4h, v6.4s, v26.4s -; CHECK-NEXT: add v5.4s, v23.4s, v18.4s -; CHECK-NEXT: addhn v3.4h, v7.4s, v27.4s -; CHECK-NEXT: add v6.4s, v28.4s, v18.4s -; CHECK-NEXT: add v16.4s, v16.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v19.4s, v4.4s -; CHECK-NEXT: addhn2 v1.8h, v20.4s, v5.4s -; CHECK-NEXT: addhn2 v2.8h, v21.4s, v6.4s -; CHECK-NEXT: addhn2 v3.8h, v22.4s, v16.4s +; CHECK-NEXT: add v0.4s, v0.4s, v21.4s +; CHECK-NEXT: add v1.4s, v1.4s, v21.4s +; CHECK-NEXT: add v2.4s, v2.4s, v21.4s +; CHECK-NEXT: add v3.4s, v3.4s, v21.4s +; CHECK-NEXT: addhn v0.4h, v5.4s, v0.4s +; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s +; CHECK-NEXT: addhn v2.4h, v6.4s, v2.4s +; CHECK-NEXT: addhn v3.4h, v7.4s, v3.4s +; CHECK-NEXT: add v4.4s, v22.4s, v21.4s +; CHECK-NEXT: add v5.4s, v23.4s, v21.4s +; CHECK-NEXT: add v6.4s, v24.4s, v21.4s +; CHECK-NEXT: add v7.4s, v16.4s, v21.4s +; CHECK-NEXT: addhn2 v0.8h, v17.4s, v4.4s +; CHECK-NEXT: addhn2 v1.8h, v18.4s, v5.4s +; CHECK-NEXT: addhn2 v2.8h, v19.4s, v6.4s +; CHECK-NEXT: addhn2 v3.8h, v20.4s, v7.4s ; CHECK-NEXT: ret entry: %c = uitofp <32 x i16> %a to <32 x bfloat> @@ -7768,9 +7768,9 @@ define <3 x bfloat> @stofp_v3i8_v3bf16(<3 x i8> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <3 x i8> %a to <3 x bfloat> @@ -7789,9 +7789,9 @@ define <3 x bfloat> @utofp_v3i8_v3bf16(<3 x i8> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <3 x i8> %a to <3 x bfloat> @@ -7808,9 +7808,9 @@ define <4 x bfloat> @stofp_v4i8_v4bf16(<4 x i8> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <4 x i8> %a to <4 x bfloat> @@ -7826,9 +7826,9 @@ define <4 x bfloat> @utofp_v4i8_v4bf16(<4 x i8> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <4 x i8> %a to <4 x bfloat> @@ -7909,11 +7909,11 @@ define <16 x bfloat> @stofp_v16i8_v16bf16(<16 x i8> %a) { ; CHECK-NEXT: add v5.4s, v5.4s, v7.4s ; CHECK-NEXT: add v0.4s, v0.4s, v7.4s ; CHECK-NEXT: addhn v1.4h, v3.4s, v5.4s -; CHECK-NEXT: add v3.4s, v16.4s, v7.4s -; CHECK-NEXT: add v5.4s, v17.4s, v7.4s ; CHECK-NEXT: addhn v0.4h, v4.4s, v0.4s +; CHECK-NEXT: add v3.4s, v16.4s, v7.4s +; CHECK-NEXT: add v4.4s, v17.4s, v7.4s ; CHECK-NEXT: addhn2 v1.8h, v2.4s, v3.4s -; CHECK-NEXT: addhn2 v0.8h, v6.4s, v5.4s +; CHECK-NEXT: addhn2 v0.8h, v6.4s, v4.4s ; CHECK-NEXT: ret entry: %c = sitofp <16 x i8> %a to <16 x bfloat> @@ -7946,11 +7946,11 @@ define <16 x bfloat> @utofp_v16i8_v16bf16(<16 x i8> %a) { ; CHECK-NEXT: add v5.4s, v5.4s, v7.4s ; CHECK-NEXT: add v0.4s, v0.4s, v7.4s ; CHECK-NEXT: addhn v1.4h, v3.4s, v5.4s -; CHECK-NEXT: add v3.4s, v16.4s, v7.4s -; CHECK-NEXT: add v5.4s, v17.4s, v7.4s ; CHECK-NEXT: addhn v0.4h, v4.4s, v0.4s +; CHECK-NEXT: add v3.4s, v16.4s, v7.4s +; CHECK-NEXT: add v4.4s, v17.4s, v7.4s ; CHECK-NEXT: addhn2 v1.8h, v2.4s, v3.4s -; CHECK-NEXT: addhn2 v0.8h, v6.4s, v5.4s +; CHECK-NEXT: addhn2 v0.8h, v6.4s, v4.4s ; CHECK-NEXT: ret entry: %c = uitofp <16 x i8> %a to <16 x bfloat> @@ -7961,14 +7961,14 @@ define <32 x bfloat> @stofp_v32i8_v32bf16(<32 x i8> %a) { ; CHECK-LABEL: stofp_v32i8_v32bf16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sshll2 v3.8h, v0.16b, #0 -; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: movi v20.4s, #127, msl #8 +; CHECK-NEXT: movi v21.4s, #127, msl #8 ; CHECK-NEXT: sshll v5.4s, v3.4h, #0 -; CHECK-NEXT: sshll v6.4s, v4.4h, #0 -; CHECK-NEXT: sshll v7.4s, v0.4h, #0 +; CHECK-NEXT: sshll v6.4s, v0.4h, #0 +; CHECK-NEXT: sshll v7.4s, v4.4h, #0 ; CHECK-NEXT: sshll v16.4s, v1.4h, #0 ; CHECK-NEXT: sshll2 v3.4s, v3.8h, #0 ; CHECK-NEXT: sshll2 v4.4s, v4.8h, #0 @@ -7980,40 +7980,40 @@ define <32 x bfloat> @stofp_v32i8_v32bf16(<32 x i8> %a) { ; CHECK-NEXT: scvtf v16.4s, v16.4s ; CHECK-NEXT: scvtf v17.4s, v3.4s ; CHECK-NEXT: scvtf v4.4s, v4.4s -; CHECK-NEXT: scvtf v19.4s, v0.4s -; CHECK-NEXT: scvtf v21.4s, v1.4s -; CHECK-NEXT: ushr v3.4s, v5.4s, #16 -; CHECK-NEXT: ushr v18.4s, v6.4s, #16 -; CHECK-NEXT: ushr v0.4s, v7.4s, #16 -; CHECK-NEXT: ushr v1.4s, v16.4s, #16 -; CHECK-NEXT: ushr v22.4s, v17.4s, #16 -; CHECK-NEXT: ushr v23.4s, v4.4s, #16 -; CHECK-NEXT: ushr v24.4s, v19.4s, #16 -; CHECK-NEXT: ushr v25.4s, v21.4s, #16 -; CHECK-NEXT: and v3.16b, v3.16b, v2.16b -; CHECK-NEXT: and v18.16b, v18.16b, v2.16b +; CHECK-NEXT: scvtf v18.4s, v0.4s +; CHECK-NEXT: scvtf v19.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v5.4s, #16 +; CHECK-NEXT: ushr v3.4s, v6.4s, #16 +; CHECK-NEXT: ushr v1.4s, v7.4s, #16 +; CHECK-NEXT: ushr v20.4s, v16.4s, #16 +; CHECK-NEXT: ushr v23.4s, v17.4s, #16 +; CHECK-NEXT: ushr v24.4s, v4.4s, #16 +; CHECK-NEXT: ushr v22.4s, v18.4s, #16 +; CHECK-NEXT: ushr v25.4s, v19.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: and v22.16b, v22.16b, v2.16b +; CHECK-NEXT: and v20.16b, v20.16b, v2.16b ; CHECK-NEXT: and v23.16b, v23.16b, v2.16b ; CHECK-NEXT: and v24.16b, v24.16b, v2.16b -; CHECK-NEXT: and v2.16b, v25.16b, v2.16b -; CHECK-NEXT: add v3.4s, v3.4s, v20.4s -; CHECK-NEXT: add v18.4s, v18.4s, v20.4s -; CHECK-NEXT: add v0.4s, v0.4s, v20.4s -; CHECK-NEXT: add v26.4s, v1.4s, v20.4s -; CHECK-NEXT: addhn v1.4h, v5.4s, v3.4s -; CHECK-NEXT: addhn v3.4h, v6.4s, v18.4s -; CHECK-NEXT: addhn v0.4h, v7.4s, v0.4s -; CHECK-NEXT: add v5.4s, v22.4s, v20.4s -; CHECK-NEXT: add v6.4s, v24.4s, v20.4s -; CHECK-NEXT: add v7.4s, v23.4s, v20.4s -; CHECK-NEXT: add v18.4s, v2.4s, v20.4s -; CHECK-NEXT: addhn v2.4h, v16.4s, v26.4s -; CHECK-NEXT: addhn2 v0.8h, v19.4s, v6.4s -; CHECK-NEXT: addhn2 v1.8h, v17.4s, v5.4s +; CHECK-NEXT: and v22.16b, v22.16b, v2.16b +; CHECK-NEXT: and v25.16b, v25.16b, v2.16b +; CHECK-NEXT: add v0.4s, v0.4s, v21.4s +; CHECK-NEXT: add v3.4s, v3.4s, v21.4s +; CHECK-NEXT: add v26.4s, v1.4s, v21.4s +; CHECK-NEXT: add v20.4s, v20.4s, v21.4s +; CHECK-NEXT: addhn v1.4h, v5.4s, v0.4s +; CHECK-NEXT: addhn v0.4h, v6.4s, v3.4s +; CHECK-NEXT: addhn v3.4h, v7.4s, v26.4s +; CHECK-NEXT: addhn v2.4h, v16.4s, v20.4s +; CHECK-NEXT: add v5.4s, v22.4s, v21.4s +; CHECK-NEXT: add v6.4s, v23.4s, v21.4s +; CHECK-NEXT: add v7.4s, v24.4s, v21.4s +; CHECK-NEXT: add v16.4s, v25.4s, v21.4s +; CHECK-NEXT: addhn2 v0.8h, v18.4s, v5.4s +; CHECK-NEXT: addhn2 v1.8h, v17.4s, v6.4s ; CHECK-NEXT: addhn2 v3.8h, v4.4s, v7.4s -; CHECK-NEXT: addhn2 v2.8h, v21.4s, v18.4s +; CHECK-NEXT: addhn2 v2.8h, v19.4s, v16.4s ; CHECK-NEXT: ret entry: %c = sitofp <32 x i8> %a to <32 x bfloat> @@ -8024,14 +8024,14 @@ define <32 x bfloat> @utofp_v32i8_v32bf16(<32 x i8> %a) { ; CHECK-LABEL: utofp_v32i8_v32bf16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0 -; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: movi v20.4s, #127, msl #8 +; CHECK-NEXT: movi v21.4s, #127, msl #8 ; CHECK-NEXT: ushll v5.4s, v3.4h, #0 -; CHECK-NEXT: ushll v6.4s, v4.4h, #0 -; CHECK-NEXT: ushll v7.4s, v0.4h, #0 +; CHECK-NEXT: ushll v6.4s, v0.4h, #0 +; CHECK-NEXT: ushll v7.4s, v4.4h, #0 ; CHECK-NEXT: ushll v16.4s, v1.4h, #0 ; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0 ; CHECK-NEXT: ushll2 v4.4s, v4.8h, #0 @@ -8043,40 +8043,40 @@ define <32 x bfloat> @utofp_v32i8_v32bf16(<32 x i8> %a) { ; CHECK-NEXT: ucvtf v16.4s, v16.4s ; CHECK-NEXT: ucvtf v17.4s, v3.4s ; CHECK-NEXT: ucvtf v4.4s, v4.4s -; CHECK-NEXT: ucvtf v19.4s, v0.4s -; CHECK-NEXT: ucvtf v21.4s, v1.4s -; CHECK-NEXT: ushr v3.4s, v5.4s, #16 -; CHECK-NEXT: ushr v18.4s, v6.4s, #16 -; CHECK-NEXT: ushr v0.4s, v7.4s, #16 -; CHECK-NEXT: ushr v1.4s, v16.4s, #16 -; CHECK-NEXT: ushr v22.4s, v17.4s, #16 -; CHECK-NEXT: ushr v23.4s, v4.4s, #16 -; CHECK-NEXT: ushr v24.4s, v19.4s, #16 -; CHECK-NEXT: ushr v25.4s, v21.4s, #16 -; CHECK-NEXT: and v3.16b, v3.16b, v2.16b -; CHECK-NEXT: and v18.16b, v18.16b, v2.16b +; CHECK-NEXT: ucvtf v18.4s, v0.4s +; CHECK-NEXT: ucvtf v19.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v5.4s, #16 +; CHECK-NEXT: ushr v3.4s, v6.4s, #16 +; CHECK-NEXT: ushr v1.4s, v7.4s, #16 +; CHECK-NEXT: ushr v20.4s, v16.4s, #16 +; CHECK-NEXT: ushr v23.4s, v17.4s, #16 +; CHECK-NEXT: ushr v24.4s, v4.4s, #16 +; CHECK-NEXT: ushr v22.4s, v18.4s, #16 +; CHECK-NEXT: ushr v25.4s, v19.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: and v22.16b, v22.16b, v2.16b +; CHECK-NEXT: and v20.16b, v20.16b, v2.16b ; CHECK-NEXT: and v23.16b, v23.16b, v2.16b ; CHECK-NEXT: and v24.16b, v24.16b, v2.16b -; CHECK-NEXT: and v2.16b, v25.16b, v2.16b -; CHECK-NEXT: add v3.4s, v3.4s, v20.4s -; CHECK-NEXT: add v18.4s, v18.4s, v20.4s -; CHECK-NEXT: add v0.4s, v0.4s, v20.4s -; CHECK-NEXT: add v26.4s, v1.4s, v20.4s -; CHECK-NEXT: addhn v1.4h, v5.4s, v3.4s -; CHECK-NEXT: addhn v3.4h, v6.4s, v18.4s -; CHECK-NEXT: addhn v0.4h, v7.4s, v0.4s -; CHECK-NEXT: add v5.4s, v22.4s, v20.4s -; CHECK-NEXT: add v6.4s, v24.4s, v20.4s -; CHECK-NEXT: add v7.4s, v23.4s, v20.4s -; CHECK-NEXT: add v18.4s, v2.4s, v20.4s -; CHECK-NEXT: addhn v2.4h, v16.4s, v26.4s -; CHECK-NEXT: addhn2 v0.8h, v19.4s, v6.4s -; CHECK-NEXT: addhn2 v1.8h, v17.4s, v5.4s +; CHECK-NEXT: and v22.16b, v22.16b, v2.16b +; CHECK-NEXT: and v25.16b, v25.16b, v2.16b +; CHECK-NEXT: add v0.4s, v0.4s, v21.4s +; CHECK-NEXT: add v3.4s, v3.4s, v21.4s +; CHECK-NEXT: add v26.4s, v1.4s, v21.4s +; CHECK-NEXT: add v20.4s, v20.4s, v21.4s +; CHECK-NEXT: addhn v1.4h, v5.4s, v0.4s +; CHECK-NEXT: addhn v0.4h, v6.4s, v3.4s +; CHECK-NEXT: addhn v3.4h, v7.4s, v26.4s +; CHECK-NEXT: addhn v2.4h, v16.4s, v20.4s +; CHECK-NEXT: add v5.4s, v22.4s, v21.4s +; CHECK-NEXT: add v6.4s, v23.4s, v21.4s +; CHECK-NEXT: add v7.4s, v24.4s, v21.4s +; CHECK-NEXT: add v16.4s, v25.4s, v21.4s +; CHECK-NEXT: addhn2 v0.8h, v18.4s, v5.4s +; CHECK-NEXT: addhn2 v1.8h, v17.4s, v6.4s ; CHECK-NEXT: addhn2 v3.8h, v4.4s, v7.4s -; CHECK-NEXT: addhn2 v2.8h, v21.4s, v18.4s +; CHECK-NEXT: addhn2 v2.8h, v19.4s, v16.4s ; CHECK-NEXT: ret entry: %c = uitofp <32 x i8> %a to <32 x bfloat> diff --git a/llvm/test/CodeGen/AArch64/ldexp.ll b/llvm/test/CodeGen/AArch64/ldexp.ll index 4b491051a88aa..ba04ba1d7bb6a 100644 --- a/llvm/test/CodeGen/AArch64/ldexp.ll +++ b/llvm/test/CodeGen/AArch64/ldexp.ll @@ -4,9 +4,9 @@ define double @testExp(double %val, i32 %a) { ; CHECK-LABEL: testExp: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: fmov d1, x8 ; CHECK-NEXT: fscale z0.d, p0/m, z0.d, z1.d @@ -22,8 +22,8 @@ declare double @ldexp(double, i32) memory(none) define float @testExpf(float %val, i32 %a) { ; CHECK-LABEL: testExpf: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 ; CHECK-NEXT: fscale z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -49,9 +49,9 @@ declare fp128 @ldexpl(fp128, i32) memory(none) define half @testExpf16(half %val, i32 %a) { ; CHECK-LABEL: testExpf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fscale z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fcvt h0, s0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll index ab15bf564ec42..59a460923e8b7 100644 --- a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll +++ b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll @@ -10,9 +10,9 @@ define @sdiv_i8( %a, %b) ; CHECK: // %bb.0: ; CHECK-NEXT: sunpkhi z2.h, z1.b ; CHECK-NEXT: sunpkhi z3.h, z0.b -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z4.s, z2.h ; CHECK-NEXT: sunpkhi z5.s, z3.h ; CHECK-NEXT: sunpklo z2.s, z2.h @@ -36,11 +36,11 @@ define @sdiv_i8( %a, %b) define @sdiv_i16( %a, %b) { ; CHECK-LABEL: sdiv_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z2.s, z1.h ; CHECK-NEXT: sunpkhi z3.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h @@ -140,9 +140,9 @@ define @srem_i8( %a, %b) define @srem_i16( %a, %b) { ; CHECK-LABEL: srem_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z2.s, z1.h ; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z4.s, z0.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: sunpklo z3.s, z1.h @@ -188,9 +188,9 @@ define @udiv_i8( %a, %b) ; CHECK: // %bb.0: ; CHECK-NEXT: uunpkhi z2.h, z1.b ; CHECK-NEXT: uunpkhi z3.h, z0.b -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z4.s, z2.h ; CHECK-NEXT: uunpkhi z5.s, z3.h ; CHECK-NEXT: uunpklo z2.s, z2.h @@ -214,11 +214,11 @@ define @udiv_i8( %a, %b) define @udiv_i16( %a, %b) { ; CHECK-LABEL: udiv_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z2.s, z1.h ; CHECK-NEXT: uunpkhi z3.s, z0.h ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h @@ -261,9 +261,9 @@ define @udiv_split_i32( %a, @udiv_widen_i32( %a, %b) { ; CHECK-LABEL: udiv_widen_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %div = udiv %a, %b @@ -319,9 +319,9 @@ define @urem_i8( %a, %b) define @urem_i16( %a, %b) { ; CHECK-LABEL: urem_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z2.s, z1.h ; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z4.s, z0.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: uunpklo z3.s, z1.h @@ -558,9 +558,9 @@ define @umin_split_i64( %a, @umin_promote_i8( %a, %b) { ; CHECK-LABEL: umin_promote_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %cmp = icmp ult %a, %b @@ -704,9 +704,9 @@ define @umax_split_i16( %a, @umax_promote_i32( %a, %b) { ; CHECK-LABEL: umax_promote_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %cmp = icmp ugt %a, %b @@ -883,8 +883,8 @@ define @lsl_split_i64( %a, @lsl_promote_i16( %a, %b){ ; CHECK-LABEL: lsl_promote_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %shl = shl %a, %b @@ -982,9 +982,9 @@ define @lsr_i64( %a, %b) define @lsr_promote_i8( %a, %b){ ; CHECK-LABEL: lsr_promote_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %shr = lshr %a, %b @@ -1081,10 +1081,10 @@ declare @llvm.fshr.nxv2i64(, @fshl_i64( %a, %b, %c){ ; CHECK-LABEL: fshl_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z3.d, #63 // =0x3f ; CHECK-NEXT: mov z4.d, z2.d ; CHECK-NEXT: lsr z1.d, z1.d, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: bic z2.d, z3.d, z2.d ; CHECK-NEXT: and z4.d, z4.d, #0x3f ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z4.d @@ -1098,17 +1098,16 @@ define @fshl_i64( %a, %b define @fshl_illegal_i64( %a, %b, %c){ ; CHECK-LABEL: fshl_illegal_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z6.d, #63 // =0x3f -; CHECK-NEXT: mov z7.d, z4.d ; CHECK-NEXT: lsr z2.d, z2.d, #1 ; CHECK-NEXT: lsr z3.d, z3.d, #1 -; CHECK-NEXT: bic z4.d, z6.d, z4.d -; CHECK-NEXT: and z7.d, z7.d, #0x3f +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: bic z7.d, z6.d, z4.d +; CHECK-NEXT: and z4.d, z4.d, #0x3f ; CHECK-NEXT: bic z6.d, z6.d, z5.d ; CHECK-NEXT: and z5.d, z5.d, #0x3f -; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z7.d -; CHECK-NEXT: lsr z2.d, p0/m, z2.d, z4.d +; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z4.d +; CHECK-NEXT: lsr z2.d, p0/m, z2.d, z7.d ; CHECK-NEXT: lsr z3.d, p0/m, z3.d, z6.d ; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z5.d ; CHECK-NEXT: orr z0.d, z0.d, z2.d @@ -1121,9 +1120,9 @@ define @fshl_illegal_i64( %a, @fshl_rot_i64( %a, %b){ ; CHECK-LABEL: fshl_rot_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: subr z1.d, z1.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z2.d, z2.d, #0x3f ; CHECK-NEXT: and z1.d, z1.d, #0x3f ; CHECK-NEXT: lslr z2.d, p0/m, z2.d, z0.d @@ -1138,11 +1137,11 @@ define @fshl_rot_i64( %a, @fshl_rot_illegal_i64( %a, %b){ ; CHECK-LABEL: fshl_rot_illegal_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.d, z2.d ; CHECK-NEXT: subr z2.d, z2.d, #0 // =0x0 ; CHECK-NEXT: mov z5.d, z3.d ; CHECK-NEXT: subr z3.d, z3.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z4.d, z4.d, #0x3f ; CHECK-NEXT: and z2.d, z2.d, #0x3f ; CHECK-NEXT: and z5.d, z5.d, #0x3f @@ -1175,10 +1174,10 @@ define @fshl_rot_const_i64( %a){ define @fshr_i64( %a, %b, %c){ ; CHECK-LABEL: fshr_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z3.d, #63 // =0x3f ; CHECK-NEXT: mov z4.d, z2.d ; CHECK-NEXT: lsl z0.d, z0.d, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: bic z2.d, z3.d, z2.d ; CHECK-NEXT: and z4.d, z4.d, #0x3f ; CHECK-NEXT: lsr z1.d, p0/m, z1.d, z4.d @@ -1192,9 +1191,9 @@ define @fshr_i64( %a, %b define @fshr_rot_i64( %a, %b){ ; CHECK-LABEL: fshr_rot_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: subr z1.d, z1.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z2.d, z2.d, #0x3f ; CHECK-NEXT: and z1.d, z1.d, #0x3f ; CHECK-NEXT: lsrr z2.d, p0/m, z2.d, z0.d diff --git a/llvm/test/CodeGen/AArch64/load-insert-zero.ll b/llvm/test/CodeGen/AArch64/load-insert-zero.ll index 993af08a66ddd..23d545459295f 100644 --- a/llvm/test/CodeGen/AArch64/load-insert-zero.ll +++ b/llvm/test/CodeGen/AArch64/load-insert-zero.ll @@ -469,18 +469,18 @@ define void @predictor_4x4_neon(ptr nocapture noundef writeonly %0, i64 noundef ; CHECK-NEXT: lsr w8, w8, #24 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b ; CHECK-NEXT: urhadd v1.8b, v1.8b, v2.8b -; CHECK-NEXT: str s1, [x0] ; CHECK-NEXT: add v0.8h, v0.8h, v3.8h ; CHECK-NEXT: dup v3.8b, w8 +; CHECK-NEXT: str s1, [x0] ; CHECK-NEXT: lsl x8, x1, #1 ; CHECK-NEXT: rshrn v0.8b, v0.8h, #2 ; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s ; CHECK-NEXT: str s0, [x0, x1] ; CHECK-NEXT: zip1 v3.2s, v0.2s, v3.2s ; CHECK-NEXT: ext v2.8b, v2.8b, v0.8b, #1 +; CHECK-NEXT: ext v1.8b, v3.8b, v0.8b, #1 ; CHECK-NEXT: str s2, [x0, x8] ; CHECK-NEXT: add x8, x8, x1 -; CHECK-NEXT: ext v1.8b, v3.8b, v0.8b, #1 ; CHECK-NEXT: str s1, [x0, x8] ; CHECK-NEXT: ret %5 = load i32, ptr %2, align 4 @@ -608,9 +608,9 @@ define void @predictor_4x4_neon_new(ptr nocapture noundef writeonly %0, i64 noun define @loadnxv8i8(ptr %p) { ; CHECK-LABEL: loadnxv8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, p0/m, w8 ; CHECK-NEXT: ret %l = load i8, ptr %p @@ -631,9 +631,9 @@ define @loadnxv16i8(ptr %p) { define @loadnxv4i16(ptr %p) { ; CHECK-LABEL: loadnxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, #0 // =0x0 ; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, p0/m, w8 ; CHECK-NEXT: ret %l = load i16, ptr %p @@ -654,9 +654,9 @@ define @loadnxv8i16(ptr %p) { define @loadnxv2i32(ptr %p) { ; CHECK-LABEL: loadnxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret %l = load i32, ptr %p @@ -688,9 +688,9 @@ define @loadnxv2i64(ptr %p) { define @loadnxv4f16(ptr %p) { ; CHECK-LABEL: loadnxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.h, #0 // =0x0 @@ -715,9 +715,9 @@ define @loadnxv8f16(ptr %p) { define @loadnxv4bf16(ptr %p) { ; CHECK-LABEL: loadnxv4bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.h, #0 // =0x0 @@ -742,9 +742,9 @@ define @loadnxv8bf16(ptr %p) { define @loadnxv2f32(ptr %p) { ; CHECK-LABEL: loadnxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d ; CHECK-NEXT: mov z0.s, #0 // =0x0 @@ -782,9 +782,9 @@ define @loadnxv2f64(ptr %p) { define @loadnxv8i8_offset(ptr %p) { ; CHECK-LABEL: loadnxv8i8_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, p0/m, w8 ; CHECK-NEXT: ret %g = getelementptr inbounds i8, ptr %p, i64 1 @@ -807,9 +807,9 @@ define @loadnxv16i8_offset(ptr %p) { define @loadnxv4i16_offset(ptr %p) { ; CHECK-LABEL: loadnxv4i16_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, #0 // =0x0 ; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, p0/m, w8 ; CHECK-NEXT: ret %g = getelementptr inbounds i8, ptr %p, i64 1 @@ -832,9 +832,9 @@ define @loadnxv8i16_offset(ptr %p) { define @loadnxv2i32_offset(ptr %p) { ; CHECK-LABEL: loadnxv2i32_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret %g = getelementptr inbounds i8, ptr %p, i64 1 @@ -869,9 +869,9 @@ define @loadnxv2i64_offset(ptr %p) { define @loadnxv4f16_offset(ptr %p) { ; CHECK-LABEL: loadnxv4f16_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.h, #0 // =0x0 @@ -898,9 +898,9 @@ define @loadnxv8f16_offset(ptr %p) { define @loadnxv4bf16_offset(ptr %p) { ; CHECK-LABEL: loadnxv4bf16_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.h, #0 // =0x0 @@ -927,9 +927,9 @@ define @loadnxv8bf16_offset(ptr %p) { define @loadnxv2f32_offset(ptr %p) { ; CHECK-LABEL: loadnxv2f32_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d ; CHECK-NEXT: mov z0.s, #0 // =0x0 diff --git a/llvm/test/CodeGen/AArch64/logic-shift.ll b/llvm/test/CodeGen/AArch64/logic-shift.ll index 39f82dd4593fb..31047954401cf 100644 --- a/llvm/test/CodeGen/AArch64/logic-shift.ll +++ b/llvm/test/CodeGen/AArch64/logic-shift.ll @@ -34,9 +34,9 @@ define i32 @or_lshr_commute1(i32 %x0, i32 %x1, i32 %y, i32 %z) { define <8 x i16> @or_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, <8 x i16> %z) { ; CHECK-LABEL: or_lshr_commute2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.8h, v2.8h ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.8h, v0.8h, v2.8h +; CHECK-NEXT: neg v1.8h, v2.8h +; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h ; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <8 x i16> %x0, %y @@ -49,9 +49,9 @@ define <8 x i16> @or_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, < define <2 x i64> @or_lshr_commute3(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %y, <2 x i64> %z) { ; CHECK-LABEL: or_lshr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.2d, v2.2d ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.2d, v0.2d, v2.2d +; CHECK-NEXT: neg v1.2d, v2.2d +; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d ; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <2 x i64> %x0, %y @@ -94,9 +94,9 @@ define i64 @or_ashr_commute1(i64 %x0, i64 %x1, i64 %y, i64 %z) { define <4 x i32> @or_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: or_ashr_commute2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: neg v1.4s, v2.4s +; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = ashr <4 x i32> %x0, %y @@ -109,9 +109,9 @@ define <4 x i32> @or_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, < define <16 x i8> @or_ashr_commute3(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %y, <16 x i8> %z) { ; CHECK-LABEL: or_ashr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.16b, v2.16b ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshl v0.16b, v0.16b, v2.16b +; CHECK-NEXT: neg v1.16b, v2.16b +; CHECK-NEXT: sshl v0.16b, v0.16b, v1.16b ; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = ashr <16 x i8> %x0, %y @@ -262,9 +262,9 @@ define i32 @xor_lshr_commute1(i32 %x0, i32 %x1, i32 %y, i32 %z) { define <8 x i16> @xor_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, <8 x i16> %z) { ; CHECK-LABEL: xor_lshr_commute2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.8h, v2.8h ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.8h, v0.8h, v2.8h +; CHECK-NEXT: neg v1.8h, v2.8h +; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h ; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <8 x i16> %x0, %y @@ -277,9 +277,9 @@ define <8 x i16> @xor_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, define <2 x i64> @xor_lshr_commute3(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %y, <2 x i64> %z) { ; CHECK-LABEL: xor_lshr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.2d, v2.2d ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.2d, v0.2d, v2.2d +; CHECK-NEXT: neg v1.2d, v2.2d +; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d ; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <2 x i64> %x0, %y @@ -322,9 +322,9 @@ define i64 @xor_ashr_commute1(i64 %x0, i64 %x1, i64 %y, i64 %z) { define <4 x i32> @xor_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: xor_ashr_commute2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: neg v1.4s, v2.4s +; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = ashr <4 x i32> %x0, %y @@ -337,9 +337,9 @@ define <4 x i32> @xor_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, define <16 x i8> @xor_ashr_commute3(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %y, <16 x i8> %z) { ; CHECK-LABEL: xor_ashr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.16b, v2.16b ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshl v0.16b, v0.16b, v2.16b +; CHECK-NEXT: neg v1.16b, v2.16b +; CHECK-NEXT: sshl v0.16b, v0.16b, v1.16b ; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = ashr <16 x i8> %x0, %y @@ -490,9 +490,9 @@ define i32 @and_lshr_commute1(i32 %x0, i32 %x1, i32 %y, i32 %z) { define <8 x i16> @and_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, <8 x i16> %z) { ; CHECK-LABEL: and_lshr_commute2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.8h, v2.8h ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.8h, v0.8h, v2.8h +; CHECK-NEXT: neg v1.8h, v2.8h +; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h ; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <8 x i16> %x0, %y @@ -505,9 +505,9 @@ define <8 x i16> @and_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, define <2 x i64> @and_lshr_commute3(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %y, <2 x i64> %z) { ; CHECK-LABEL: and_lshr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.2d, v2.2d ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.2d, v0.2d, v2.2d +; CHECK-NEXT: neg v1.2d, v2.2d +; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d ; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <2 x i64> %x0, %y @@ -550,9 +550,9 @@ define i64 @and_ashr_commute1(i64 %x0, i64 %x1, i64 %y, i64 %z) { define <4 x i32> @and_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: and_ashr_commute2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: neg v1.4s, v2.4s +; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = ashr <4 x i32> %x0, %y @@ -565,9 +565,9 @@ define <4 x i32> @and_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, define <16 x i8> @and_ashr_commute3(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %y, <16 x i8> %z) { ; CHECK-LABEL: and_ashr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.16b, v2.16b ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshl v0.16b, v0.16b, v2.16b +; CHECK-NEXT: neg v1.16b, v2.16b +; CHECK-NEXT: sshl v0.16b, v0.16b, v1.16b ; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = ashr <16 x i8> %x0, %y diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll index 06570b4539cc1..fac96e07de541 100644 --- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll +++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll @@ -258,10 +258,10 @@ define @splice_nxv2i1_idx( %a, @llvm.experimental.vector.splice.nxv2i1( %a, %b, i32 1) ret %res @@ -273,10 +273,10 @@ define @splice_nxv4i1_idx( %a, @llvm.experimental.vector.splice.nxv4i1( %a, %b, i32 2) ret %res @@ -288,10 +288,10 @@ define @splice_nxv8i1_idx( %a, @llvm.experimental.vector.splice.nxv8i1( %a, %b, i32 4) ret %res @@ -303,10 +303,10 @@ define @splice_nxv16i1_idx( %a, @llvm.experimental.vector.splice.nxv16i1( %a, %b, i32 8) ret %res @@ -350,16 +350,16 @@ define @splice_nxv16f32_16( %a, @splice_nxv16i8_neg17( %a, @splice_nxv8i16_neg9( %a, @splice_nxv8f16_neg9( %a, @splice_nxv2i1( %a, ; CHECK-NEXT: ptrue p2.d, vl1 ; CHECK-NEXT: mov z0.d, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1 +; CHECK-NEXT: rev p0.d, p2.d +; CHECK-NEXT: splice z1.d, p0, z1.d, z0.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: rev p2.d, p2.d -; CHECK-NEXT: splice z1.d, p2, z1.d, z0.d ; CHECK-NEXT: and z1.d, z1.d, #0x1 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: ret @@ -716,9 +716,9 @@ define @splice_nxv4i1( %a, ; CHECK-NEXT: ptrue p2.s, vl1 ; CHECK-NEXT: mov z0.s, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 +; CHECK-NEXT: rev p0.s, p2.s +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: rev p2.s, p2.s -; CHECK-NEXT: splice z1.s, p2, z1.s, z0.s ; CHECK-NEXT: and z1.s, z1.s, #0x1 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; CHECK-NEXT: ret @@ -733,9 +733,9 @@ define @splice_nxv8i1( %a, ; CHECK-NEXT: ptrue p2.h, vl1 ; CHECK-NEXT: mov z0.h, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 +; CHECK-NEXT: rev p0.h, p2.h +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: rev p2.h, p2.h -; CHECK-NEXT: splice z1.h, p2, z1.h, z0.h ; CHECK-NEXT: and z1.h, z1.h, #0x1 ; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0 ; CHECK-NEXT: ret @@ -750,9 +750,9 @@ define @splice_nxv16i1( %a, @splice_nxv8i32( %a, @splice_nxv16f32_neg17( %a, %a, <24 x i8> %b, <24 x i8> %c, <24 ; CHECK-NEXT: ld1 { v5.b }[15], [x8] ; CHECK-NEXT: ld1 { v7.b }[7], [x10] ; CHECK-NEXT: addp v1.2s, v19.2s, v19.2s +; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: sdot v16.4s, v5.16b, v4.16b ; CHECK-NEXT: sdot v18.2s, v7.8b, v6.8b -; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: addv s2, v16.4s ; CHECK-NEXT: addp v3.2s, v18.2s, v18.2s @@ -975,8 +975,8 @@ define i32 @test_sdot_v24i8_double_nomla(<24 x i8> %a, <24 x i8> %b, <24 x i8> % ; CHECK-NEXT: addv s3, v5.4s ; CHECK-NEXT: addp v1.2s, v17.2s, v17.2s ; CHECK-NEXT: addp v2.2s, v7.2s, v7.2s -; CHECK-NEXT: addv s0, v6.4s ; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: addv s0, v6.4s ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: fmov w11, s2 ; CHECK-NEXT: fmov w8, s0 @@ -998,26 +998,26 @@ entry: define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v25i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q4, q0, [x0] +; CHECK-NEXT: ldp q2, q0, [x0] ; CHECK-NEXT: ldp q5, q1, [x1] -; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 -; CHECK-NEXT: ushll v6.8h, v4.8b, #0 +; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0 +; CHECK-NEXT: ushll v6.8h, v2.8b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: ushll v7.8h, v5.8b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: umull v2.4s, v3.4h, v2.4h -; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: umull v3.4s, v4.4h, v3.4h +; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: umull2 v16.4s, v7.8h, v6.8h ; CHECK-NEXT: umull v6.4s, v7.4h, v6.4h -; CHECK-NEXT: mov v3.s[0], v2.s[0] -; CHECK-NEXT: ushll2 v2.8h, v4.16b, #0 -; CHECK-NEXT: ushll2 v4.8h, v5.16b, #0 -; CHECK-NEXT: umlal v6.4s, v1.4h, v0.4h +; CHECK-NEXT: mov v4.s[0], v3.s[0] +; CHECK-NEXT: ushll2 v3.8h, v5.16b, #0 ; CHECK-NEXT: umlal2 v16.4s, v1.8h, v0.8h -; CHECK-NEXT: umlal v3.4s, v4.4h, v2.4h -; CHECK-NEXT: umlal2 v16.4s, v4.8h, v2.8h -; CHECK-NEXT: add v0.4s, v6.4s, v3.4s +; CHECK-NEXT: umlal v6.4s, v1.4h, v0.4h +; CHECK-NEXT: umlal v4.4s, v3.4h, v2.4h +; CHECK-NEXT: umlal2 v16.4s, v3.8h, v2.8h +; CHECK-NEXT: add v0.4s, v6.4s, v4.4s ; CHECK-NEXT: add v0.4s, v0.4s, v16.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -1039,17 +1039,17 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: ushll v4.8h, v2.8b, #0 -; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0 ; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 -; CHECK-NEXT: uaddl2 v5.4s, v4.8h, v3.8h -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: uaddl v3.4s, v4.4h, v3.4h -; CHECK-NEXT: mov v0.s[0], v1.s[0] -; CHECK-NEXT: uaddw2 v1.4s, v5.4s, v2.8h +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: uaddl2 v5.4s, v4.8h, v1.8h +; CHECK-NEXT: uaddl v1.4s, v4.4h, v1.4h +; CHECK-NEXT: mov v0.s[0], v3.s[0] +; CHECK-NEXT: uaddw2 v3.4s, v5.4s, v2.8h +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 @@ -1063,26 +1063,26 @@ entry: define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v25i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q4, q0, [x0] +; CHECK-NEXT: ldp q2, q0, [x0] ; CHECK-NEXT: ldp q5, q1, [x1] -; CHECK-NEXT: sshll2 v2.8h, v0.16b, #0 -; CHECK-NEXT: sshll v6.8h, v4.8b, #0 +; CHECK-NEXT: sshll2 v3.8h, v0.16b, #0 +; CHECK-NEXT: sshll v6.8h, v2.8b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: sshll v7.8h, v5.8b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: smull v2.4s, v3.4h, v2.4h -; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: smull v3.4s, v4.4h, v3.4h +; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: smull2 v16.4s, v7.8h, v6.8h ; CHECK-NEXT: smull v6.4s, v7.4h, v6.4h -; CHECK-NEXT: mov v3.s[0], v2.s[0] -; CHECK-NEXT: sshll2 v2.8h, v4.16b, #0 -; CHECK-NEXT: sshll2 v4.8h, v5.16b, #0 -; CHECK-NEXT: smlal v6.4s, v1.4h, v0.4h +; CHECK-NEXT: mov v4.s[0], v3.s[0] +; CHECK-NEXT: sshll2 v3.8h, v5.16b, #0 ; CHECK-NEXT: smlal2 v16.4s, v1.8h, v0.8h -; CHECK-NEXT: smlal v3.4s, v4.4h, v2.4h -; CHECK-NEXT: smlal2 v16.4s, v4.8h, v2.8h -; CHECK-NEXT: add v0.4s, v6.4s, v3.4s +; CHECK-NEXT: smlal v6.4s, v1.4h, v0.4h +; CHECK-NEXT: smlal v4.4s, v3.4h, v2.4h +; CHECK-NEXT: smlal2 v16.4s, v3.8h, v2.8h +; CHECK-NEXT: add v0.4s, v6.4s, v4.4s ; CHECK-NEXT: add v0.4s, v0.4s, v16.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -1105,222 +1105,222 @@ define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: fmov s4, w0 -; CHECK-NEXT: ldr b0, [sp, #80] -; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b1, [sp, #16] -; CHECK-NEXT: add x10, sp, #24 -; CHECK-NEXT: ldr b2, [sp, #280] -; CHECK-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-NEXT: ldr b0, [sp, #80] +; CHECK-NEXT: add x11, sp, #24 ; CHECK-NEXT: ldr b3, [sp, #216] +; CHECK-NEXT: add x10, sp, #88 +; CHECK-NEXT: ldr b2, [sp, #280] +; CHECK-NEXT: ld1 { v1.b }[1], [x11] ; CHECK-NEXT: add x11, sp, #224 -; CHECK-NEXT: mov v4.b[1], w1 -; CHECK-NEXT: ld1 { v1.b }[1], [x10] +; CHECK-NEXT: ldr b4, [sp, #152] +; CHECK-NEXT: ldr b6, [sp, #480] +; CHECK-NEXT: ld1 { v0.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #288 -; CHECK-NEXT: ldr b5, [sp, #152] -; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: add x12, sp, #160 ; CHECK-NEXT: ld1 { v3.b }[1], [x11] -; CHECK-NEXT: add x10, sp, #160 -; CHECK-NEXT: ld1 { v0.b }[2], [x9] -; CHECK-NEXT: ld1 { v5.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #32 -; CHECK-NEXT: add x11, sp, #296 -; CHECK-NEXT: mov v4.b[2], w2 -; CHECK-NEXT: ld1 { v1.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #232 +; CHECK-NEXT: add x11, sp, #488 +; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: ld1 { v4.b }[1], [x12] +; CHECK-NEXT: ld1 { v6.b }[1], [x11] +; CHECK-NEXT: add x11, sp, #32 +; CHECK-NEXT: add x9, sp, #96 ; CHECK-NEXT: add x8, sp, #104 -; CHECK-NEXT: ld1 { v2.b }[2], [x11] -; CHECK-NEXT: ld1 { v3.b }[2], [x10] +; CHECK-NEXT: ld1 { v1.b }[2], [x11] +; CHECK-NEXT: add x11, sp, #232 +; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #296 +; CHECK-NEXT: ld1 { v3.b }[2], [x11] ; CHECK-NEXT: add x11, sp, #168 +; CHECK-NEXT: ld1 { v2.b }[2], [x9] +; CHECK-NEXT: ld1 { v4.b }[2], [x11] +; CHECK-NEXT: add x11, sp, #40 +; CHECK-NEXT: ld1 { v1.b }[3], [x11] ; CHECK-NEXT: ld1 { v0.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #40 -; CHECK-NEXT: ld1 { v5.b }[2], [x11] -; CHECK-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #240 -; CHECK-NEXT: mov v4.b[3], w3 -; CHECK-NEXT: ld1 { v3.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #176 -; CHECK-NEXT: add x12, sp, #112 -; CHECK-NEXT: add x13, sp, #48 -; CHECK-NEXT: add x9, sp, #120 -; CHECK-NEXT: ld1 { v5.b }[3], [x8] -; CHECK-NEXT: ld1 { v0.b }[4], [x12] -; CHECK-NEXT: add x12, sp, #184 -; CHECK-NEXT: ld1 { v1.b }[4], [x13] -; CHECK-NEXT: add x15, sp, #56 -; CHECK-NEXT: add x14, sp, #128 -; CHECK-NEXT: mov v4.b[4], w4 -; CHECK-NEXT: add x11, sp, #304 -; CHECK-NEXT: add x13, sp, #256 -; CHECK-NEXT: ld1 { v5.b }[4], [x12] -; CHECK-NEXT: ld1 { v0.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #192 -; CHECK-NEXT: add x12, sp, #248 -; CHECK-NEXT: ld1 { v1.b }[5], [x15] -; CHECK-NEXT: add x15, sp, #200 -; CHECK-NEXT: ld1 { v3.b }[4], [x12] -; CHECK-NEXT: ld1 { v2.b }[3], [x11] +; CHECK-NEXT: add x8, sp, #304 +; CHECK-NEXT: add x10, sp, #112 +; CHECK-NEXT: add x11, sp, #240 +; CHECK-NEXT: add x13, sp, #56 +; CHECK-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: ld1 { v3.b }[3], [x11] +; CHECK-NEXT: ld1 { v1.b }[4], [x8] +; CHECK-NEXT: ld1 { v0.b }[4], [x10] +; CHECK-NEXT: add x15, sp, #312 +; CHECK-NEXT: add x12, sp, #120 +; CHECK-NEXT: add x8, sp, #248 ; CHECK-NEXT: add x11, sp, #64 -; CHECK-NEXT: mov v4.b[5], w5 -; CHECK-NEXT: ld1 { v5.b }[5], [x9] -; CHECK-NEXT: ld1 { v0.b }[6], [x14] -; CHECK-NEXT: ldr b6, [sp, #352] -; CHECK-NEXT: add x10, sp, #136 -; CHECK-NEXT: ld1 { v1.b }[6], [x11] -; CHECK-NEXT: add x11, sp, #360 -; CHECK-NEXT: ld1 { v3.b }[5], [x13] +; CHECK-NEXT: ld1 { v2.b }[4], [x15] +; CHECK-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-NEXT: add x15, sp, #320 +; CHECK-NEXT: ld1 { v1.b }[5], [x13] +; CHECK-NEXT: ld1 { v0.b }[5], [x12] ; CHECK-NEXT: ldr b18, [sp, #552] -; CHECK-NEXT: ld1 { v5.b }[6], [x15] -; CHECK-NEXT: add x14, sp, #208 -; CHECK-NEXT: ld1 { v6.b }[1], [x11] -; CHECK-NEXT: mov v4.b[6], w6 -; CHECK-NEXT: ld1 { v0.b }[7], [x10] -; CHECK-NEXT: add x10, sp, #560 -; CHECK-NEXT: add x9, sp, #264 -; CHECK-NEXT: ld1 { v18.b }[1], [x10] +; CHECK-NEXT: add x14, sp, #128 +; CHECK-NEXT: add x16, sp, #256 +; CHECK-NEXT: ldr b16, [sp, #352] +; CHECK-NEXT: ld1 { v2.b }[5], [x15] +; CHECK-NEXT: add x15, sp, #176 +; CHECK-NEXT: ld1 { v3.b }[5], [x16] +; CHECK-NEXT: ld1 { v1.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #560 +; CHECK-NEXT: ld1 { v0.b }[6], [x14] +; CHECK-NEXT: add x16, sp, #360 +; CHECK-NEXT: ld1 { v4.b }[3], [x15] +; CHECK-NEXT: ld1 { v18.b }[1], [x11] +; CHECK-NEXT: add x10, sp, #72 +; CHECK-NEXT: ld1 { v16.b }[1], [x16] +; CHECK-NEXT: add x9, sp, #136 +; CHECK-NEXT: add x14, sp, #184 +; CHECK-NEXT: ld1 { v1.b }[7], [x10] ; CHECK-NEXT: add x10, sp, #568 -; CHECK-NEXT: ld1 { v5.b }[7], [x14] -; CHECK-NEXT: ld1 { v3.b }[6], [x9] +; CHECK-NEXT: ld1 { v0.b }[7], [x9] +; CHECK-NEXT: ld1 { v4.b }[4], [x14] ; CHECK-NEXT: add x9, sp, #368 -; CHECK-NEXT: ld1 { v6.b }[2], [x9] -; CHECK-NEXT: add x11, sp, #488 -; CHECK-NEXT: ldr b7, [sp, #144] -; CHECK-NEXT: mov v4.b[7], w7 ; CHECK-NEXT: ld1 { v18.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #376 -; CHECK-NEXT: sshll v17.8h, v5.8b, #0 -; CHECK-NEXT: ldr b5, [sp, #480] -; CHECK-NEXT: sshll v7.8h, v7.8b, #0 -; CHECK-NEXT: ld1 { v6.b }[3], [x10] +; CHECK-NEXT: add x11, sp, #496 +; CHECK-NEXT: ld1 { v16.b }[2], [x9] +; CHECK-NEXT: fmov s5, w0 +; CHECK-NEXT: add x9, sp, #192 +; CHECK-NEXT: ld1 { v6.b }[2], [x11] ; CHECK-NEXT: add x10, sp, #576 -; CHECK-NEXT: add x8, sp, #312 -; CHECK-NEXT: ld1 { v5.b }[1], [x11] +; CHECK-NEXT: ld1 { v4.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #376 ; CHECK-NEXT: ld1 { v18.b }[3], [x10] -; CHECK-NEXT: add x11, sp, #496 -; CHECK-NEXT: sshll v16.8h, v4.8b, #0 -; CHECK-NEXT: ldr b4, [sp, #344] -; CHECK-NEXT: add x10, sp, #384 -; CHECK-NEXT: ld1 { v6.b }[4], [x10] +; CHECK-NEXT: add x11, sp, #504 +; CHECK-NEXT: ld1 { v16.b }[3], [x9] +; CHECK-NEXT: mov v5.b[1], w1 +; CHECK-NEXT: ldr b7, [sp, #144] +; CHECK-NEXT: ldr b17, [sp, #344] +; CHECK-NEXT: add x9, sp, #200 +; CHECK-NEXT: ld1 { v6.b }[3], [x11] ; CHECK-NEXT: add x10, sp, #584 -; CHECK-NEXT: ld1 { v2.b }[4], [x8] -; CHECK-NEXT: sshll v19.8h, v4.8b, #0 -; CHECK-NEXT: ld1 { v5.b }[2], [x11] +; CHECK-NEXT: ld1 { v4.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #384 ; CHECK-NEXT: ld1 { v18.b }[4], [x10] -; CHECK-NEXT: smull2 v4.4s, v16.8h, v17.8h -; CHECK-NEXT: smull v16.4s, v16.4h, v17.4h -; CHECK-NEXT: ldr b17, [sp, #416] -; CHECK-NEXT: add x11, sp, #504 -; CHECK-NEXT: add x10, sp, #424 -; CHECK-NEXT: add x16, sp, #320 -; CHECK-NEXT: smull v19.4s, v7.4h, v19.4h -; CHECK-NEXT: movi v7.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v5.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #392 -; CHECK-NEXT: ld1 { v17.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #592 -; CHECK-NEXT: ld1 { v2.b }[5], [x16] -; CHECK-NEXT: ld1 { v6.b }[5], [x11] -; CHECK-NEXT: ld1 { v18.b }[5], [x10] +; CHECK-NEXT: sshll v7.8h, v7.8b, #0 +; CHECK-NEXT: sshll v17.8h, v17.8b, #0 ; CHECK-NEXT: add x11, sp, #512 -; CHECK-NEXT: add x10, sp, #432 +; CHECK-NEXT: ld1 { v16.b }[4], [x9] +; CHECK-NEXT: ld1 { v6.b }[4], [x11] +; CHECK-NEXT: add x11, sp, #592 +; CHECK-NEXT: mov v5.b[2], w2 +; CHECK-NEXT: add x10, sp, #392 +; CHECK-NEXT: ldr b19, [sp, #680] +; CHECK-NEXT: ld1 { v18.b }[5], [x11] +; CHECK-NEXT: smull v7.4s, v7.4h, v17.4h +; CHECK-NEXT: ldr b17, [sp, #416] +; CHECK-NEXT: ld1 { v16.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #688 ; CHECK-NEXT: add x12, sp, #328 -; CHECK-NEXT: mov v7.s[0], v19.s[0] -; CHECK-NEXT: ld1 { v5.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #400 -; CHECK-NEXT: ld1 { v17.b }[2], [x10] +; CHECK-NEXT: add x9, sp, #424 +; CHECK-NEXT: ld1 { v19.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #600 -; CHECK-NEXT: ldr b19, [sp, #680] ; CHECK-NEXT: ldr b20, [sp, #616] ; CHECK-NEXT: ld1 { v2.b }[6], [x12] -; CHECK-NEXT: ld1 { v6.b }[6], [x11] +; CHECK-NEXT: ld1 { v17.b }[1], [x9] +; CHECK-NEXT: add x11, sp, #400 ; CHECK-NEXT: ld1 { v18.b }[6], [x10] -; CHECK-NEXT: add x11, sp, #688 ; CHECK-NEXT: add x12, sp, #624 -; CHECK-NEXT: ld1 { v19.b }[1], [x11] +; CHECK-NEXT: mov v5.b[3], w3 +; CHECK-NEXT: ld1 { v16.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #696 ; CHECK-NEXT: ld1 { v20.b }[1], [x12] -; CHECK-NEXT: add x10, sp, #408 +; CHECK-NEXT: add x9, sp, #432 +; CHECK-NEXT: ld1 { v19.b }[2], [x11] ; CHECK-NEXT: add x11, sp, #608 -; CHECK-NEXT: add x12, sp, #440 -; CHECK-NEXT: ld1 { v6.b }[7], [x10] +; CHECK-NEXT: ld1 { v17.b }[2], [x9] +; CHECK-NEXT: add x10, sp, #408 ; CHECK-NEXT: ld1 { v18.b }[7], [x11] -; CHECK-NEXT: ld1 { v17.b }[3], [x12] -; CHECK-NEXT: add x10, sp, #696 ; CHECK-NEXT: add x11, sp, #632 -; CHECK-NEXT: ld1 { v19.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #448 +; CHECK-NEXT: ld1 { v16.b }[7], [x10] ; CHECK-NEXT: ld1 { v20.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #640 -; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: ld1 { v17.b }[4], [x10] +; CHECK-NEXT: mov v5.b[4], w4 ; CHECK-NEXT: add x10, sp, #704 -; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: add x12, sp, #440 ; CHECK-NEXT: ld1 { v19.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #712 -; CHECK-NEXT: add x12, sp, #520 -; CHECK-NEXT: ld1 { v20.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #648 -; CHECK-NEXT: ldr b21, [sp, #544] -; CHECK-NEXT: smull2 v22.4s, v6.8h, v18.8h -; CHECK-NEXT: smull v6.4s, v6.4h, v18.4h -; CHECK-NEXT: ldr b18, [sp, #744] -; CHECK-NEXT: ld1 { v19.b }[4], [x10] -; CHECK-NEXT: ld1 { v5.b }[5], [x12] -; CHECK-NEXT: add x12, sp, #656 -; CHECK-NEXT: ld1 { v20.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #456 -; CHECK-NEXT: sshll v21.8h, v21.8b, #0 -; CHECK-NEXT: ld1 { v17.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #720 +; CHECK-NEXT: add x10, sp, #448 +; CHECK-NEXT: ld1 { v17.b }[3], [x12] +; CHECK-NEXT: add x12, sp, #640 +; CHECK-NEXT: sshll v21.8h, v16.8b, #0 +; CHECK-NEXT: ld1 { v20.b }[3], [x12] ; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: add x11, sp, #712 +; CHECK-NEXT: mov v5.b[5], w5 +; CHECK-NEXT: ld1 { v19.b }[4], [x11] +; CHECK-NEXT: add x9, sp, #520 +; CHECK-NEXT: ld1 { v17.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #648 +; CHECK-NEXT: ldr b22, [sp, #544] +; CHECK-NEXT: ld1 { v20.b }[4], [x10] +; CHECK-NEXT: smull2 v16.4s, v21.8h, v18.8h +; CHECK-NEXT: smull v18.4s, v21.4h, v18.4h +; CHECK-NEXT: ldr b21, [sp, #744] +; CHECK-NEXT: add x11, sp, #720 +; CHECK-NEXT: ld1 { v6.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #456 ; CHECK-NEXT: ld1 { v19.b }[5], [x11] +; CHECK-NEXT: mov v5.b[6], w6 +; CHECK-NEXT: ld1 { v17.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #656 +; CHECK-NEXT: sshll v22.8h, v22.8b, #0 +; CHECK-NEXT: sshll v21.8h, v21.8b, #0 +; CHECK-NEXT: ld1 { v20.b }[5], [x9] ; CHECK-NEXT: add x10, sp, #528 -; CHECK-NEXT: add x11, sp, #464 -; CHECK-NEXT: ld1 { v20.b }[5], [x12] -; CHECK-NEXT: ld1 { v5.b }[6], [x10] -; CHECK-NEXT: add x12, sp, #728 -; CHECK-NEXT: add x13, sp, #664 -; CHECK-NEXT: add x8, sp, #72 -; CHECK-NEXT: ld1 { v17.b }[6], [x11] -; CHECK-NEXT: ld1 { v19.b }[6], [x12] -; CHECK-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-NEXT: add x11, sp, #728 +; CHECK-NEXT: ld1 { v6.b }[6], [x10] +; CHECK-NEXT: add x10, sp, #464 +; CHECK-NEXT: ld1 { v19.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #664 +; CHECK-NEXT: ld1 { v17.b }[6], [x10] +; CHECK-NEXT: smull v21.4s, v22.4h, v21.4h +; CHECK-NEXT: movi v22.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v20.b }[6], [x11] +; CHECK-NEXT: mov v5.b[7], w7 +; CHECK-NEXT: add x9, sp, #536 +; CHECK-NEXT: add x10, sp, #736 +; CHECK-NEXT: add x11, sp, #208 +; CHECK-NEXT: add x13, sp, #264 +; CHECK-NEXT: ld1 { v6.b }[7], [x9] +; CHECK-NEXT: ld1 { v19.b }[7], [x10] +; CHECK-NEXT: ld1 { v4.b }[7], [x11] +; CHECK-NEXT: add x9, sp, #472 +; CHECK-NEXT: add x10, sp, #672 +; CHECK-NEXT: ld1 { v3.b }[6], [x13] +; CHECK-NEXT: ld1 { v17.b }[7], [x9] +; CHECK-NEXT: ld1 { v20.b }[7], [x10] ; CHECK-NEXT: add x8, sp, #336 -; CHECK-NEXT: ld1 { v20.b }[6], [x13] -; CHECK-NEXT: add x9, sp, #272 -; CHECK-NEXT: smull v18.4s, v21.4h, v18.4h +; CHECK-NEXT: mov v22.s[0], v21.s[0] ; CHECK-NEXT: movi v21.2d, #0000000000000000 -; CHECK-NEXT: add x10, sp, #536 +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: sshll v19.8h, v19.8b, #0 ; CHECK-NEXT: ld1 { v2.b }[7], [x8] -; CHECK-NEXT: ld1 { v3.b }[7], [x9] -; CHECK-NEXT: ld1 { v5.b }[7], [x10] -; CHECK-NEXT: add x8, sp, #472 -; CHECK-NEXT: add x9, sp, #736 -; CHECK-NEXT: add x10, sp, #672 -; CHECK-NEXT: ld1 { v17.b }[7], [x8] -; CHECK-NEXT: ld1 { v19.b }[7], [x9] -; CHECK-NEXT: ld1 { v20.b }[7], [x10] +; CHECK-NEXT: add x8, sp, #272 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-NEXT: sshll v17.8h, v17.8b, #0 +; CHECK-NEXT: sshll v20.8h, v20.8b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: mov v21.s[0], v18.s[0] ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: smlal v18.4s, v6.4h, v19.4h +; CHECK-NEXT: smlal2 v16.4s, v6.8h, v19.8h +; CHECK-NEXT: mov v21.s[0], v7.s[0] +; CHECK-NEXT: smull v6.4s, v5.4h, v4.4h ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: sshll v17.8h, v17.8b, #0 -; CHECK-NEXT: sshll v18.8h, v19.8b, #0 -; CHECK-NEXT: sshll v19.8h, v20.8b, #0 -; CHECK-NEXT: smlal v16.4s, v0.4h, v2.4h +; CHECK-NEXT: smlal v22.4s, v17.4h, v20.4h +; CHECK-NEXT: smull2 v4.4s, v5.8h, v4.8h +; CHECK-NEXT: smlal v21.4s, v1.4h, v3.4h +; CHECK-NEXT: smlal2 v16.4s, v17.8h, v20.8h +; CHECK-NEXT: smlal v6.4s, v0.4h, v2.4h +; CHECK-NEXT: add v5.4s, v18.4s, v22.4s ; CHECK-NEXT: smlal2 v4.4s, v0.8h, v2.8h -; CHECK-NEXT: smlal v7.4s, v1.4h, v3.4h -; CHECK-NEXT: smlal v6.4s, v5.4h, v18.4h -; CHECK-NEXT: smlal2 v22.4s, v5.8h, v18.8h -; CHECK-NEXT: smlal v21.4s, v17.4h, v19.4h +; CHECK-NEXT: add v0.4s, v6.4s, v21.4s +; CHECK-NEXT: add v2.4s, v5.4s, v16.4s ; CHECK-NEXT: smlal2 v4.4s, v1.8h, v3.8h -; CHECK-NEXT: add v0.4s, v16.4s, v7.4s -; CHECK-NEXT: add v1.4s, v6.4s, v21.4s -; CHECK-NEXT: smlal2 v22.4s, v17.8h, v19.8h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-NEXT: add v1.4s, v1.4s, v22.4s -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1584,34 +1584,34 @@ entry: define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v33i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: movi v18.2d, #0000000000000000 -; CHECK-NEXT: ldp q4, q5, [x1] ; CHECK-NEXT: ldr b0, [x0, #32] ; CHECK-NEXT: ldr b1, [x1, #32] +; CHECK-NEXT: ldp q2, q4, [x0] +; CHECK-NEXT: ldp q3, q6, [x1] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v6.8h, v2.8b, #0 -; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v7.8h, v4.8b, #0 -; CHECK-NEXT: ushll2 v4.8h, v4.16b, #0 -; CHECK-NEXT: ushll2 v16.8h, v3.16b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll2 v19.8h, v5.16b, #0 -; CHECK-NEXT: ushll v5.8h, v5.8b, #0 +; CHECK-NEXT: ushll v5.8h, v2.8b, #0 +; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: ushll2 v16.8h, v4.16b, #0 +; CHECK-NEXT: ushll v7.8h, v3.8b, #0 +; CHECK-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-NEXT: ushll v4.8h, v4.8b, #0 ; CHECK-NEXT: umull v0.4s, v1.4h, v0.4h -; CHECK-NEXT: umull2 v1.4s, v7.8h, v6.8h -; CHECK-NEXT: umull2 v17.4s, v4.8h, v2.8h -; CHECK-NEXT: umull v2.4s, v4.4h, v2.4h -; CHECK-NEXT: umlal2 v17.4s, v19.8h, v16.8h -; CHECK-NEXT: umlal2 v1.4s, v5.8h, v3.8h -; CHECK-NEXT: mov v18.s[0], v0.s[0] -; CHECK-NEXT: umlal v2.4s, v19.4h, v16.4h -; CHECK-NEXT: add v0.4s, v1.4s, v17.4s -; CHECK-NEXT: umlal v18.4s, v7.4h, v6.4h -; CHECK-NEXT: umlal v18.4s, v5.4h, v3.4h -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-NEXT: add v0.4s, v18.4s, v0.4s +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ushll2 v19.8h, v6.16b, #0 +; CHECK-NEXT: ushll v6.8h, v6.8b, #0 +; CHECK-NEXT: umull2 v17.4s, v7.8h, v5.8h +; CHECK-NEXT: umull2 v18.4s, v3.8h, v2.8h +; CHECK-NEXT: mov v1.s[0], v0.s[0] +; CHECK-NEXT: umull v0.4s, v3.4h, v2.4h +; CHECK-NEXT: umlal2 v18.4s, v19.8h, v16.8h +; CHECK-NEXT: umlal2 v17.4s, v6.8h, v4.8h +; CHECK-NEXT: umlal v1.4s, v7.4h, v5.4h +; CHECK-NEXT: umlal v0.4s, v19.4h, v16.4h +; CHECK-NEXT: add v2.4s, v17.4s, v18.4s +; CHECK-NEXT: umlal v1.4s, v6.4h, v4.4h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 @@ -1639,14 +1639,14 @@ define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) { ; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-NEXT: ushll2 v3.8h, v3.16b, #0 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: uaddl2 v6.4s, v3.8h, v2.8h -; CHECK-NEXT: uaddl v2.4s, v3.4h, v2.4h +; CHECK-NEXT: uaddl2 v6.4s, v5.8h, v4.8h ; CHECK-NEXT: mov v0.s[0], v1.s[0] -; CHECK-NEXT: uaddl2 v1.4s, v5.8h, v4.8h -; CHECK-NEXT: add v1.4s, v1.4s, v6.4s +; CHECK-NEXT: uaddl2 v1.4s, v3.8h, v2.8h +; CHECK-NEXT: uaddl v2.4s, v3.4h, v2.4h +; CHECK-NEXT: add v1.4s, v6.4s, v1.4s ; CHECK-NEXT: uaddw v0.4s, v0.4s, v5.4h -; CHECK-NEXT: uaddw v0.4s, v0.4s, v4.4h ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: uaddw v0.4s, v0.4s, v4.4h ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 @@ -1660,34 +1660,34 @@ entry: define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v33i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: movi v18.2d, #0000000000000000 -; CHECK-NEXT: ldp q4, q5, [x1] ; CHECK-NEXT: ldr b0, [x0, #32] ; CHECK-NEXT: ldr b1, [x1, #32] +; CHECK-NEXT: ldp q2, q4, [x0] +; CHECK-NEXT: ldp q3, q6, [x1] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll v6.8h, v2.8b, #0 -; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: sshll v7.8h, v4.8b, #0 -; CHECK-NEXT: sshll2 v4.8h, v4.16b, #0 -; CHECK-NEXT: sshll2 v16.8h, v3.16b, #0 -; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: sshll2 v19.8h, v5.16b, #0 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: sshll v5.8h, v2.8b, #0 +; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: sshll2 v16.8h, v4.16b, #0 +; CHECK-NEXT: sshll v7.8h, v3.8b, #0 +; CHECK-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 ; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h -; CHECK-NEXT: smull2 v1.4s, v7.8h, v6.8h -; CHECK-NEXT: smull2 v17.4s, v4.8h, v2.8h -; CHECK-NEXT: smull v2.4s, v4.4h, v2.4h -; CHECK-NEXT: smlal2 v17.4s, v19.8h, v16.8h -; CHECK-NEXT: smlal2 v1.4s, v5.8h, v3.8h -; CHECK-NEXT: mov v18.s[0], v0.s[0] -; CHECK-NEXT: smlal v2.4s, v19.4h, v16.4h -; CHECK-NEXT: add v0.4s, v1.4s, v17.4s -; CHECK-NEXT: smlal v18.4s, v7.4h, v6.4h -; CHECK-NEXT: smlal v18.4s, v5.4h, v3.4h -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-NEXT: add v0.4s, v18.4s, v0.4s +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: sshll2 v19.8h, v6.16b, #0 +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: smull2 v17.4s, v7.8h, v5.8h +; CHECK-NEXT: smull2 v18.4s, v3.8h, v2.8h +; CHECK-NEXT: mov v1.s[0], v0.s[0] +; CHECK-NEXT: smull v0.4s, v3.4h, v2.4h +; CHECK-NEXT: smlal2 v18.4s, v19.8h, v16.8h +; CHECK-NEXT: smlal2 v17.4s, v6.8h, v4.8h +; CHECK-NEXT: smlal v1.4s, v7.4h, v5.4h +; CHECK-NEXT: smlal v0.4s, v19.4h, v16.4h +; CHECK-NEXT: add v2.4s, v17.4s, v18.4s +; CHECK-NEXT: smlal v1.4s, v6.4h, v4.4h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 @@ -2018,151 +2018,151 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ldr b1, [sp, #80] +; CHECK-NEXT: ldr b0, [sp, #80] ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b2, [sp, #144] -; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: ldr b3, [sp, #16] -; CHECK-NEXT: add x12, sp, #32 -; CHECK-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-NEXT: ld1 { v2.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: add x8, sp, #24 -; CHECK-NEXT: add x11, sp, #112 -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: ld1 { v3.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #160 -; CHECK-NEXT: ldr b4, [sp, #480] -; CHECK-NEXT: ld1 { v1.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #104 -; CHECK-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-NEXT: add x10, sp, #152 +; CHECK-NEXT: add x11, sp, #160 +; CHECK-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: add x8, sp, #104 +; CHECK-NEXT: ldr b3, [sp, #16] +; CHECK-NEXT: ldr b5, [sp, #480] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: add x10, sp, #24 +; CHECK-NEXT: add x13, sp, #488 +; CHECK-NEXT: ldr b4, [sp, #608] +; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: ld1 { v2.b }[2], [x11] +; CHECK-NEXT: add x12, sp, #112 +; CHECK-NEXT: ld1 { v3.b }[1], [x10] +; CHECK-NEXT: ld1 { v5.b }[1], [x13] +; CHECK-NEXT: add x10, sp, #616 +; CHECK-NEXT: mov v1.b[1], w1 +; CHECK-NEXT: ld1 { v4.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #32 +; CHECK-NEXT: ld1 { v0.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #168 -; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: add x13, sp, #48 -; CHECK-NEXT: ld1 { v3.b }[2], [x12] -; CHECK-NEXT: add x12, sp, #40 -; CHECK-NEXT: ldr b5, [sp, #608] -; CHECK-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #120 ; CHECK-NEXT: ld1 { v2.b }[3], [x8] -; CHECK-NEXT: mov v0.b[1], w1 -; CHECK-NEXT: add x9, sp, #128 -; CHECK-NEXT: add x14, sp, #184 -; CHECK-NEXT: ldr b16, [sp, #544] -; CHECK-NEXT: ld1 { v3.b }[3], [x12] -; CHECK-NEXT: add x12, sp, #176 -; CHECK-NEXT: ldr b17, [sp, #672] -; CHECK-NEXT: ld1 { v1.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #488 -; CHECK-NEXT: ld1 { v2.b }[4], [x12] -; CHECK-NEXT: ld1 { v4.b }[1], [x11] -; CHECK-NEXT: mov v0.b[2], w2 -; CHECK-NEXT: add x11, sp, #192 -; CHECK-NEXT: ld1 { v3.b }[4], [x13] -; CHECK-NEXT: add x13, sp, #616 -; CHECK-NEXT: add x12, sp, #56 -; CHECK-NEXT: ld1 { v1.b }[5], [x10] -; CHECK-NEXT: ld1 { v5.b }[1], [x13] -; CHECK-NEXT: add x13, sp, #496 +; CHECK-NEXT: add x8, sp, #496 +; CHECK-NEXT: ld1 { v3.b }[2], [x10] +; CHECK-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #176 +; CHECK-NEXT: ldr b6, [sp, #544] +; CHECK-NEXT: ld1 { v0.b }[4], [x12] +; CHECK-NEXT: add x14, sp, #552 +; CHECK-NEXT: ldr b7, [sp, #672] +; CHECK-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-NEXT: add x13, sp, #40 +; CHECK-NEXT: ld1 { v6.b }[1], [x14] +; CHECK-NEXT: mov v1.b[2], w2 +; CHECK-NEXT: add x11, sp, #128 +; CHECK-NEXT: ld1 { v3.b }[3], [x13] +; CHECK-NEXT: ld1 { v0.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #680 +; CHECK-NEXT: add x13, sp, #184 +; CHECK-NEXT: ld1 { v7.b }[1], [x9] +; CHECK-NEXT: ld1 { v2.b }[5], [x13] +; CHECK-NEXT: add x13, sp, #624 +; CHECK-NEXT: add x15, sp, #504 ; CHECK-NEXT: ld1 { v4.b }[2], [x13] -; CHECK-NEXT: ld1 { v2.b }[5], [x14] -; CHECK-NEXT: add x14, sp, #680 -; CHECK-NEXT: ld1 { v17.b }[1], [x14] -; CHECK-NEXT: add x13, sp, #504 -; CHECK-NEXT: ld1 { v3.b }[5], [x12] -; CHECK-NEXT: ld1 { v1.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #552 -; CHECK-NEXT: add x12, sp, #688 -; CHECK-NEXT: ld1 { v16.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #624 -; CHECK-NEXT: ld1 { v4.b }[3], [x13] -; CHECK-NEXT: ld1 { v2.b }[6], [x11] +; CHECK-NEXT: add x10, sp, #136 +; CHECK-NEXT: ld1 { v0.b }[6], [x11] ; CHECK-NEXT: add x11, sp, #560 -; CHECK-NEXT: add x8, sp, #136 -; CHECK-NEXT: ld1 { v17.b }[2], [x12] -; CHECK-NEXT: ld1 { v5.b }[2], [x9] -; CHECK-NEXT: ld1 { v1.b }[7], [x8] -; CHECK-NEXT: ld1 { v16.b }[2], [x11] -; CHECK-NEXT: add x8, sp, #512 -; CHECK-NEXT: mov v0.b[3], w3 -; CHECK-NEXT: ld1 { v4.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #568 -; CHECK-NEXT: add x9, sp, #696 -; CHECK-NEXT: add x11, sp, #632 -; CHECK-NEXT: ld1 { v17.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #520 -; CHECK-NEXT: ld1 { v16.b }[3], [x8] -; CHECK-NEXT: ld1 { v5.b }[3], [x11] -; CHECK-NEXT: add x8, sp, #640 -; CHECK-NEXT: ld1 { v4.b }[5], [x9] +; CHECK-NEXT: ld1 { v5.b }[3], [x15] +; CHECK-NEXT: ld1 { v6.b }[2], [x11] +; CHECK-NEXT: add x11, sp, #688 +; CHECK-NEXT: mov v1.b[3], w3 +; CHECK-NEXT: ld1 { v7.b }[2], [x11] +; CHECK-NEXT: add x9, sp, #632 +; CHECK-NEXT: add x11, sp, #512 +; CHECK-NEXT: ld1 { v0.b }[7], [x10] +; CHECK-NEXT: ld1 { v4.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #568 +; CHECK-NEXT: add x10, sp, #696 +; CHECK-NEXT: ld1 { v6.b }[3], [x9] +; CHECK-NEXT: ld1 { v5.b }[4], [x11] +; CHECK-NEXT: ld1 { v7.b }[3], [x10] +; CHECK-NEXT: add x9, sp, #640 +; CHECK-NEXT: mov v1.b[4], w4 +; CHECK-NEXT: ld1 { v4.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #576 -; CHECK-NEXT: add x11, sp, #704 +; CHECK-NEXT: add x10, sp, #704 +; CHECK-NEXT: add x11, sp, #520 +; CHECK-NEXT: ld1 { v6.b }[4], [x9] ; CHECK-NEXT: ldr b18, [sp, #736] -; CHECK-NEXT: mov v0.b[4], w4 -; CHECK-NEXT: ld1 { v17.b }[4], [x11] -; CHECK-NEXT: ld1 { v16.b }[4], [x9] -; CHECK-NEXT: ld1 { v5.b }[4], [x8] -; CHECK-NEXT: add x9, sp, #528 -; CHECK-NEXT: sshll v18.8h, v18.8b, #0 -; CHECK-NEXT: add x8, sp, #648 +; CHECK-NEXT: ld1 { v7.b }[4], [x10] +; CHECK-NEXT: ld1 { v5.b }[5], [x11] +; CHECK-NEXT: add x12, sp, #192 +; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: ld1 { v2.b }[6], [x12] +; CHECK-NEXT: add x9, sp, #648 +; CHECK-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-NEXT: add x10, sp, #528 ; CHECK-NEXT: add x11, sp, #584 ; CHECK-NEXT: add x12, sp, #712 -; CHECK-NEXT: ld1 { v4.b }[6], [x9] -; CHECK-NEXT: movi v7.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v16.b }[5], [x11] -; CHECK-NEXT: ld1 { v17.b }[5], [x12] -; CHECK-NEXT: ld1 { v5.b }[5], [x8] -; CHECK-NEXT: mov v0.b[5], w5 -; CHECK-NEXT: add x9, sp, #536 -; CHECK-NEXT: sshll v18.4s, v18.4h, #0 -; CHECK-NEXT: add x8, sp, #656 +; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: mov v1.b[5], w5 +; CHECK-NEXT: ld1 { v6.b }[5], [x11] +; CHECK-NEXT: ld1 { v7.b }[5], [x12] +; CHECK-NEXT: ld1 { v4.b }[5], [x9] +; CHECK-NEXT: ld1 { v5.b }[6], [x10] +; CHECK-NEXT: add x14, sp, #56 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v3.b }[5], [x14] +; CHECK-NEXT: add x9, sp, #656 +; CHECK-NEXT: add x10, sp, #536 ; CHECK-NEXT: add x11, sp, #592 ; CHECK-NEXT: add x12, sp, #720 -; CHECK-NEXT: ld1 { v4.b }[7], [x9] -; CHECK-NEXT: ld1 { v16.b }[6], [x11] -; CHECK-NEXT: ld1 { v17.b }[6], [x12] -; CHECK-NEXT: ld1 { v5.b }[6], [x8] -; CHECK-NEXT: ldr b6, [sp, #208] -; CHECK-NEXT: add x10, sp, #64 -; CHECK-NEXT: mov v7.s[0], v18.s[0] -; CHECK-NEXT: mov v0.b[6], w6 -; CHECK-NEXT: ld1 { v3.b }[6], [x10] +; CHECK-NEXT: sshll v18.4s, v18.4h, #0 +; CHECK-NEXT: ldr b16, [sp, #208] +; CHECK-NEXT: ld1 { v6.b }[6], [x11] +; CHECK-NEXT: ld1 { v7.b }[6], [x12] +; CHECK-NEXT: ld1 { v4.b }[6], [x9] +; CHECK-NEXT: ld1 { v5.b }[7], [x10] +; CHECK-NEXT: add x8, sp, #64 +; CHECK-NEXT: mov v1.b[6], w6 +; CHECK-NEXT: sshll v16.8h, v16.8b, #0 +; CHECK-NEXT: ld1 { v3.b }[6], [x8] ; CHECK-NEXT: add x8, sp, #664 ; CHECK-NEXT: add x9, sp, #600 ; CHECK-NEXT: add x10, sp, #728 -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: ld1 { v16.b }[7], [x9] -; CHECK-NEXT: ld1 { v17.b }[7], [x10] -; CHECK-NEXT: ld1 { v5.b }[7], [x8] +; CHECK-NEXT: mov v17.s[0], v18.s[0] +; CHECK-NEXT: ld1 { v6.b }[7], [x9] +; CHECK-NEXT: ld1 { v7.b }[7], [x10] +; CHECK-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 ; CHECK-NEXT: movi v18.2d, #0000000000000000 -; CHECK-NEXT: mov v0.b[7], w7 +; CHECK-NEXT: sshll v16.4s, v16.4h, #0 +; CHECK-NEXT: mov v1.b[7], w7 ; CHECK-NEXT: add x9, sp, #200 -; CHECK-NEXT: add x10, sp, #72 -; CHECK-NEXT: saddw v7.4s, v7.4s, v4.4h -; CHECK-NEXT: sshll v6.4s, v6.4h, #0 -; CHECK-NEXT: sshll v16.8h, v16.8b, #0 -; CHECK-NEXT: sshll v17.8h, v17.8b, #0 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: add x8, sp, #72 ; CHECK-NEXT: ld1 { v2.b }[7], [x9] -; CHECK-NEXT: ld1 { v3.b }[7], [x10] -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: mov v18.s[0], v6.s[0] +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-NEXT: sshll v7.8h, v7.8b, #0 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: saddw v17.4s, v17.4s, v5.4h +; CHECK-NEXT: mov v18.s[0], v16.s[0] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: saddl2 v6.4s, v17.8h, v16.8h -; CHECK-NEXT: saddl2 v4.4s, v5.8h, v4.8h -; CHECK-NEXT: saddl v16.4s, v17.4h, v16.4h -; CHECK-NEXT: saddw v5.4s, v7.4s, v5.4h +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: saddl2 v17.4s, v0.8h, v1.8h -; CHECK-NEXT: saddw v0.4s, v18.4s, v0.4h +; CHECK-NEXT: saddl2 v16.4s, v7.8h, v6.8h +; CHECK-NEXT: saddl2 v5.4s, v4.8h, v5.8h +; CHECK-NEXT: saddl v6.4s, v7.4h, v6.4h +; CHECK-NEXT: saddw v4.4s, v17.4s, v4.4h +; CHECK-NEXT: saddl2 v17.4s, v1.8h, v0.8h +; CHECK-NEXT: saddw v1.4s, v18.4s, v1.4h ; CHECK-NEXT: saddl2 v7.4s, v3.8h, v2.8h -; CHECK-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h ; CHECK-NEXT: add v5.4s, v5.4s, v16.4s -; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h +; CHECK-NEXT: add v4.4s, v4.4s, v6.4s +; CHECK-NEXT: saddw v0.4s, v1.4s, v0.4h ; CHECK-NEXT: add v6.4s, v17.4s, v7.4s -; CHECK-NEXT: add v1.4s, v5.4s, v4.4s +; CHECK-NEXT: add v1.4s, v4.4s, v5.4s ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: add v1.4s, v6.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll index 913205f327536..16200435c5c31 100644 --- a/llvm/test/CodeGen/AArch64/neon-extadd.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll @@ -515,15 +515,15 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) { ; CHECK-NEXT: mov v0.b[6], w6 ; CHECK-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-NEXT: uaddl v2.8h, v3.8b, v2.8b +; CHECK-NEXT: ushll v3.4s, v4.4h, #0 ; CHECK-NEXT: mov v0.b[7], w7 -; CHECK-NEXT: ushll2 v3.4s, v2.8h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: ushll v1.4s, v4.4h, #0 -; CHECK-NEXT: stp q3, q1, [x8, #48] -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: stp q1, q3, [x8, #48] +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: stp q1, q2, [x8, #16] +; CHECK-NEXT: stp q3, q2, [x8, #16] ; CHECK-NEXT: str q0, [x8] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/neon-shift-neg.ll b/llvm/test/CodeGen/AArch64/neon-shift-neg.ll index 881bbf315e8e9..45272143e8592 100644 --- a/llvm/test/CodeGen/AArch64/neon-shift-neg.ll +++ b/llvm/test/CodeGen/AArch64/neon-shift-neg.ll @@ -375,8 +375,8 @@ entry: define @shrn64x2( %a, i64 %b) { ; CHECK-LABEL: shrn64x2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: neg x8, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -391,8 +391,8 @@ entry: define @shrn32x4( %a, i32 %b) { ; CHECK-LABEL: shrn32x4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -407,8 +407,8 @@ entry: define @shrn16x8( %a, i16 %b) { ; CHECK-LABEL: shrn16x8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -423,8 +423,8 @@ entry: define @shrn8x16( %a, i8 %b) { ; CHECK-LABEL: shrn8x16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret @@ -439,8 +439,8 @@ entry: define @lshrn64x2( %a, i64 %b) { ; CHECK-LABEL: lshrn64x2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: neg x8, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -455,8 +455,8 @@ entry: define @lshrn32x4( %a, i32 %b) { ; CHECK-LABEL: lshrn32x4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -471,8 +471,8 @@ entry: define @lshrn16x8( %a, i16 %b) { ; CHECK-LABEL: lshrn16x8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -487,8 +487,8 @@ entry: define @lshrn8x16( %a, i8 %b) { ; CHECK-LABEL: lshrn8x16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret @@ -503,8 +503,8 @@ entry: define @shln64x2( %a, i64 %b) { ; CHECK-LABEL: shln64x2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: neg x8, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -519,8 +519,8 @@ entry: define @shln32x4( %a, i32 %b) { ; CHECK-LABEL: shln32x4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -535,8 +535,8 @@ entry: define @shln16x8( %a, i16 %b) { ; CHECK-LABEL: shln16x8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -551,8 +551,8 @@ entry: define @shln8x16( %a, i8 %b) { ; CHECK-LABEL: shln8x16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/predicated-add-sub.ll b/llvm/test/CodeGen/AArch64/predicated-add-sub.ll index 6b3cfc040cb3d..20088354bdb75 100644 --- a/llvm/test/CodeGen/AArch64/predicated-add-sub.ll +++ b/llvm/test/CodeGen/AArch64/predicated-add-sub.ll @@ -83,11 +83,11 @@ define @zext.add.2xi64( %a, @zext.add.8xi32( %a, %v) #0 { ; CHECK-LABEL: zext.add.8xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: mov z2.s, #1 // =0x1 +; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: add z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: add z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: add z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: ret %extend = zext %v to %result = add %a, %extend @@ -103,8 +103,8 @@ define @zext.add.16xi32( %a, @zext.sub.2xi64( %a, @zext.sub.8xi32( %a, %v) #0 { ; CHECK-LABEL: zext.sub.8xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: add z0.s, p1/m, z0.s, z2.s ; CHECK-NEXT: add z1.s, p0/m, z1.s, z2.s @@ -214,8 +214,8 @@ define @zext.sub.16xi32( %a, @sext.add.2xi64( %a, @sext.add.8xi32( %a, %v) #0 { ; CHECK-LABEL: sext.add.8xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: add z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: add z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: add z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: ret %extend = sext %v to %result = add %a, %extend @@ -325,8 +325,8 @@ define @sext.add.16xi32( %a, @sext.sub.2xi64( %a, @sext.sub.8xi32( %a, %v) #0 { ; CHECK-LABEL: sext.sub.8xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: sub z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: sub z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: sub z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: ret %extend = sext %v to %result = sub %a, %extend @@ -436,8 +436,8 @@ define @sext.sub.16xi32( %a, This Inner Loop Header: Depth=1 +; CHECK-NEXT: str q14, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: ldr q14, [x8] ; CHECK-NEXT: mov x12, xzr -; CHECK-NEXT: str q18, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: ldr x14, [x12] -; CHECK-NEXT: ldr q15, [x12] -; CHECK-NEXT: add x7, x11, x8 +; CHECK-NEXT: stp q29, q15, [sp] // 32-byte Folded Spill +; CHECK-NEXT: add x19, x11, x8 ; CHECK-NEXT: fmov x15, d14 ; CHECK-NEXT: mov x16, v14.d[1] -; CHECK-NEXT: ldr q18, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fmov x18, d15 -; CHECK-NEXT: mov x13, v15.d[1] -; CHECK-NEXT: ldr x5, [x8] +; CHECK-NEXT: ldr q15, [x12] ; CHECK-NEXT: ldr q14, [x10], #64 -; CHECK-NEXT: ldr x7, [x7, #128] +; CHECK-NEXT: mov v8.16b, v28.16b +; CHECK-NEXT: fmov x13, d15 +; CHECK-NEXT: mov x18, v15.d[1] +; CHECK-NEXT: mov v28.16b, v24.16b ; CHECK-NEXT: mul x17, x15, x14 -; CHECK-NEXT: mov v6.16b, v0.16b -; CHECK-NEXT: mov v9.16b, v27.16b ; CHECK-NEXT: mov x12, v14.d[1] ; CHECK-NEXT: fmov x4, d14 -; CHECK-NEXT: mov v27.16b, v23.16b +; CHECK-NEXT: mov v24.16b, v20.16b +; CHECK-NEXT: mov v20.16b, v17.16b +; CHECK-NEXT: mov v17.16b, v5.16b ; CHECK-NEXT: mul x1, x16, x14 -; CHECK-NEXT: mov v23.16b, v19.16b -; CHECK-NEXT: mov v19.16b, v7.16b -; CHECK-NEXT: mov v7.16b, v2.16b -; CHECK-NEXT: stp q26, q31, [sp] // 32-byte Folded Spill -; CHECK-NEXT: mov v31.16b, v22.16b -; CHECK-NEXT: mul x0, x18, x14 -; CHECK-NEXT: mov v26.16b, v10.16b -; CHECK-NEXT: mov v22.16b, v5.16b +; CHECK-NEXT: ldr q5, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x5, [x8] +; CHECK-NEXT: ldr x19, [x19, #128] +; CHECK-NEXT: mov v29.16b, v21.16b +; CHECK-NEXT: mov v21.16b, v0.16b +; CHECK-NEXT: mul x0, x13, x14 +; CHECK-NEXT: mov v25.16b, v6.16b +; CHECK-NEXT: mov v6.16b, v2.16b ; CHECK-NEXT: fmov d15, x17 -; CHECK-NEXT: mov v5.16b, v1.16b -; CHECK-NEXT: mov v8.16b, v20.16b -; CHECK-NEXT: mul x2, x13, x14 -; CHECK-NEXT: mov v20.16b, v16.16b -; CHECK-NEXT: mov v16.16b, v3.16b -; CHECK-NEXT: mov v10.16b, v21.16b -; CHECK-NEXT: mov v21.16b, v17.16b -; CHECK-NEXT: mov v17.16b, v4.16b +; CHECK-NEXT: mov v26.16b, v22.16b +; CHECK-NEXT: mov v22.16b, v18.16b +; CHECK-NEXT: mul x2, x18, x14 +; CHECK-NEXT: mov v18.16b, v7.16b +; CHECK-NEXT: mov v7.16b, v3.16b +; CHECK-NEXT: mov v16.16b, v4.16b +; CHECK-NEXT: add x8, x8, #8 +; CHECK-NEXT: add x9, x9, #1 ; CHECK-NEXT: mov v15.d[1], x1 ; CHECK-NEXT: mul x3, x12, x14 -; CHECK-NEXT: add x8, x8, #8 -; CHECK-NEXT: fmov d14, x0 ; CHECK-NEXT: cmp x8, #64 -; CHECK-NEXT: add x9, x9, #1 +; CHECK-NEXT: fmov d14, x0 ; CHECK-NEXT: mul x14, x4, x14 -; CHECK-NEXT: add v18.2d, v18.2d, v15.2d -; CHECK-NEXT: mul x19, x15, x5 +; CHECK-NEXT: add v5.2d, v5.2d, v15.2d +; CHECK-NEXT: mul x20, x15, x5 ; CHECK-NEXT: mov v14.d[1], x2 -; CHECK-NEXT: mul x15, x15, x7 +; CHECK-NEXT: mul x15, x15, x19 ; CHECK-NEXT: fmov d0, x14 -; CHECK-NEXT: str q18, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: ldp q18, q15, [sp, #32] // 32-byte Folded Reload -; CHECK-NEXT: mul x6, x16, x5 -; CHECK-NEXT: fmov d1, x19 +; CHECK-NEXT: str q5, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: ldr q5, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: mul x21, x13, x19 +; CHECK-NEXT: add v5.2d, v5.2d, v14.2d +; CHECK-NEXT: fmov d3, x20 +; CHECK-NEXT: mul x7, x16, x5 ; CHECK-NEXT: mov v0.d[1], x3 -; CHECK-NEXT: mul x16, x16, x7 -; CHECK-NEXT: fmov d2, x15 -; CHECK-NEXT: add v15.2d, v15.2d, v14.2d -; CHECK-NEXT: mul x21, x18, x7 -; CHECK-NEXT: mov v1.d[1], x6 -; CHECK-NEXT: mul x0, x4, x7 -; CHECK-NEXT: str q15, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: add v15.2d, v11.2d, v14.2d -; CHECK-NEXT: mov v2.d[1], x16 -; CHECK-NEXT: ldr q11, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: mul x20, x13, x7 -; CHECK-NEXT: fmov d3, x21 -; CHECK-NEXT: add v11.2d, v11.2d, v0.2d -; CHECK-NEXT: add v12.2d, v12.2d, v1.2d -; CHECK-NEXT: mul x22, x12, x7 -; CHECK-NEXT: fmov d4, x0 -; CHECK-NEXT: add v18.2d, v18.2d, v2.2d -; CHECK-NEXT: mov v2.16b, v7.16b -; CHECK-NEXT: mul x14, x18, x5 -; CHECK-NEXT: mov v7.16b, v19.16b -; CHECK-NEXT: mov v19.16b, v23.16b -; CHECK-NEXT: mov v3.d[1], x20 -; CHECK-NEXT: mov v23.16b, v27.16b -; CHECK-NEXT: add v27.2d, v9.2d, v1.2d -; CHECK-NEXT: mul x15, x4, x5 -; CHECK-NEXT: str q11, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: mov v11.16b, v15.16b -; CHECK-NEXT: mov v4.d[1], x22 -; CHECK-NEXT: add v19.2d, v19.2d, v1.2d -; CHECK-NEXT: add v7.2d, v7.2d, v1.2d +; CHECK-NEXT: fmov d1, x15 +; CHECK-NEXT: mul x16, x16, x19 +; CHECK-NEXT: str q5, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: add v5.2d, v13.2d, v14.2d +; CHECK-NEXT: fmov d2, x21 +; CHECK-NEXT: ldr q13, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: mul x6, x18, x5 +; CHECK-NEXT: ldp q15, q14, [sp, #16] // 32-byte Folded Reload +; CHECK-NEXT: mov v3.d[1], x7 +; CHECK-NEXT: add v13.2d, v13.2d, v0.2d +; CHECK-NEXT: mul x18, x18, x19 +; CHECK-NEXT: mov v1.d[1], x16 +; CHECK-NEXT: mul x22, x4, x19 +; CHECK-NEXT: str q13, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov v13.16b, v5.16b +; CHECK-NEXT: mov v5.16b, v17.16b +; CHECK-NEXT: mov v17.16b, v20.16b +; CHECK-NEXT: mov v20.16b, v24.16b ; CHECK-NEXT: mul x13, x13, x5 -; CHECK-NEXT: add v23.2d, v23.2d, v1.2d -; CHECK-NEXT: add v1.2d, v5.2d, v1.2d -; CHECK-NEXT: fmov d14, x14 -; CHECK-NEXT: add v30.2d, v30.2d, v3.2d -; CHECK-NEXT: mov v3.16b, v16.16b +; CHECK-NEXT: mov v24.16b, v28.16b +; CHECK-NEXT: add v11.2d, v11.2d, v3.2d +; CHECK-NEXT: mov v2.d[1], x18 +; CHECK-NEXT: add v15.2d, v15.2d, v1.2d +; CHECK-NEXT: add v27.2d, v27.2d, v3.2d +; CHECK-NEXT: mul x17, x12, x19 +; CHECK-NEXT: add v23.2d, v23.2d, v3.2d +; CHECK-NEXT: add v19.2d, v19.2d, v3.2d +; CHECK-NEXT: fmov d4, x22 +; CHECK-NEXT: add v10.2d, v10.2d, v3.2d +; CHECK-NEXT: mul x14, x4, x5 +; CHECK-NEXT: fmov d0, x13 +; CHECK-NEXT: add v14.2d, v14.2d, v2.2d +; CHECK-NEXT: add v2.2d, v6.2d, v3.2d ; CHECK-NEXT: mul x12, x12, x5 -; CHECK-NEXT: mov v16.16b, v20.16b -; CHECK-NEXT: mov v5.16b, v22.16b -; CHECK-NEXT: fmov d0, x15 -; CHECK-NEXT: add v28.2d, v28.2d, v4.2d -; CHECK-NEXT: mov v4.16b, v17.16b -; CHECK-NEXT: mov v17.16b, v21.16b -; CHECK-NEXT: mov v21.16b, v10.16b -; CHECK-NEXT: mov v10.16b, v26.16b -; CHECK-NEXT: mov v14.d[1], x13 -; CHECK-NEXT: mov v22.16b, v31.16b -; CHECK-NEXT: ldp q26, q31, [sp] // 32-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], x12 -; CHECK-NEXT: add v13.2d, v13.2d, v14.2d -; CHECK-NEXT: add v31.2d, v31.2d, v14.2d -; CHECK-NEXT: add v26.2d, v26.2d, v14.2d -; CHECK-NEXT: add v24.2d, v24.2d, v14.2d -; CHECK-NEXT: add v22.2d, v22.2d, v14.2d -; CHECK-NEXT: add v20.2d, v8.2d, v14.2d -; CHECK-NEXT: add v10.2d, v10.2d, v14.2d -; CHECK-NEXT: add v16.2d, v16.2d, v14.2d -; CHECK-NEXT: add v5.2d, v5.2d, v14.2d -; CHECK-NEXT: add v3.2d, v3.2d, v14.2d -; CHECK-NEXT: add v2.2d, v2.2d, v14.2d -; CHECK-NEXT: add v29.2d, v29.2d, v0.2d -; CHECK-NEXT: add v25.2d, v25.2d, v0.2d -; CHECK-NEXT: add v21.2d, v21.2d, v0.2d +; CHECK-NEXT: mov v3.16b, v7.16b +; CHECK-NEXT: mov v7.16b, v18.16b +; CHECK-NEXT: mov v4.d[1], x17 +; CHECK-NEXT: mov v18.16b, v22.16b +; CHECK-NEXT: mov v0.d[1], x6 +; CHECK-NEXT: fmov d1, x14 +; CHECK-NEXT: add v28.2d, v8.2d, v4.2d +; CHECK-NEXT: mov v1.d[1], x12 +; CHECK-NEXT: add v31.2d, v31.2d, v0.2d +; CHECK-NEXT: add v30.2d, v30.2d, v0.2d +; CHECK-NEXT: add v12.2d, v12.2d, v0.2d +; CHECK-NEXT: add v24.2d, v24.2d, v0.2d +; CHECK-NEXT: add v22.2d, v26.2d, v0.2d +; CHECK-NEXT: add v20.2d, v20.2d, v0.2d +; CHECK-NEXT: add v18.2d, v18.2d, v0.2d ; CHECK-NEXT: add v17.2d, v17.2d, v0.2d -; CHECK-NEXT: add v4.2d, v4.2d, v0.2d -; CHECK-NEXT: add v0.2d, v6.2d, v0.2d +; CHECK-NEXT: add v7.2d, v7.2d, v0.2d +; CHECK-NEXT: add v4.2d, v16.2d, v0.2d +; CHECK-NEXT: add v3.2d, v3.2d, v0.2d +; CHECK-NEXT: mov v0.16b, v21.16b +; CHECK-NEXT: mov v21.16b, v29.16b +; CHECK-NEXT: ldr q29, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add v9.2d, v9.2d, v1.2d +; CHECK-NEXT: add v6.2d, v25.2d, v1.2d +; CHECK-NEXT: add v5.2d, v5.2d, v1.2d +; CHECK-NEXT: add v29.2d, v29.2d, v1.2d +; CHECK-NEXT: add v21.2d, v21.2d, v1.2d +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup -; CHECK-NEXT: ldr q6, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: adrp x8, C ; CHECK-NEXT: add x8, x8, :lo12:C -; CHECK-NEXT: stp q12, q31, [x8, #80] +; CHECK-NEXT: stp q11, q30, [x8, #80] ; CHECK-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload -; CHECK-NEXT: str q6, [x8] -; CHECK-NEXT: ldr q6, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: str q29, [x8, #112] +; CHECK-NEXT: str q1, [x8] +; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: stp q15, q14, [x8, #144] ; CHECK-NEXT: ldp x22, x21, [sp, #160] // 16-byte Folded Reload -; CHECK-NEXT: stp q6, q11, [x8, #16] -; CHECK-NEXT: ldr q6, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: stp q18, q30, [x8, #144] -; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload -; CHECK-NEXT: stp q6, q13, [x8, #48] +; CHECK-NEXT: stp q1, q13, [x8, #16] +; CHECK-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: stp q28, q12, [x8, #176] ; CHECK-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload -; CHECK-NEXT: stp q28, q26, [x8, #176] +; CHECK-NEXT: stp q1, q31, [x8, #48] ; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: stp q19, q10, [x8, #336] +; CHECK-NEXT: stp q9, q24, [x8, #240] +; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload +; CHECK-NEXT: stp q19, q18, [x8, #336] +; CHECK-NEXT: stp q10, q7, [x8, #400] ; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload +; CHECK-NEXT: str q29, [x8, #112] ; CHECK-NEXT: str q27, [x8, #208] -; CHECK-NEXT: stp q25, q24, [x8, #240] ; CHECK-NEXT: stp q23, q22, [x8, #272] ; CHECK-NEXT: stp q21, q20, [x8, #304] -; CHECK-NEXT: stp q17, q16, [x8, #368] -; CHECK-NEXT: stp q7, q5, [x8, #400] -; CHECK-NEXT: stp q4, q3, [x8, #432] -; CHECK-NEXT: stp q1, q2, [x8, #464] +; CHECK-NEXT: stp q6, q17, [x8, #368] +; CHECK-NEXT: stp q5, q4, [x8, #432] +; CHECK-NEXT: stp q2, q3, [x8, #464] ; CHECK-NEXT: str q0, [x8, #496] ; CHECK-NEXT: add sp, sp, #192 ; CHECK-NEXT: .cfi_def_cfa_offset 0 diff --git a/llvm/test/CodeGen/AArch64/rcpc3-sve.ll b/llvm/test/CodeGen/AArch64/rcpc3-sve.ll index 47e3381517499..6b03e5d12bfd3 100644 --- a/llvm/test/CodeGen/AArch64/rcpc3-sve.ll +++ b/llvm/test/CodeGen/AArch64/rcpc3-sve.ll @@ -8,8 +8,8 @@ define hidden @test_load_sve_lane0(ptr nocapture noundef readonly %a, noundef %b) local_unnamed_addr { ; CHECK-LABEL: test_load_sve_lane0: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: ldapr x8, [x0] +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret %1 = load atomic i64, ptr %a acquire, align 8 @@ -20,9 +20,9 @@ define hidden @test_load_sve_lane0(ptr nocapture noundef read define hidden @test_load_sve_lane1(ptr nocapture noundef readonly %a, noundef %b) local_unnamed_addr { ; CHECK-LABEL: test_load_sve_lane1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: ldapr x8, [x0] ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d diff --git a/llvm/test/CodeGen/AArch64/reassocmls.ll b/llvm/test/CodeGen/AArch64/reassocmls.ll index 381caffba92eb..acbf9fc584a2e 100644 --- a/llvm/test/CodeGen/AArch64/reassocmls.ll +++ b/llvm/test/CodeGen/AArch64/reassocmls.ll @@ -79,7 +79,7 @@ define i64 @mls_i64_C(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e) { ; CHECK-LABEL: mls_i64_C: ; CHECK: // %bb.0: ; CHECK-NEXT: mul x8, x2, x1 -; CHECK-NEXT: mov w9, #10 +; CHECK-NEXT: mov w9, #10 // =0xa ; CHECK-NEXT: madd x8, x4, x3, x8 ; CHECK-NEXT: sub x0, x9, x8 ; CHECK-NEXT: ret @@ -290,9 +290,9 @@ define @smlsl_nxv8i16( %a, @umlsl_nxv8i16( %a, %b, %c, %d, %e) { ; CHECK-LABEL: umlsl_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z3.h, z3.h, #0xff ; CHECK-NEXT: and z4.h, z4.h, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z2.h, z2.h, #0xff ; CHECK-NEXT: mls z0.h, p0/m, z4.h, z3.h @@ -326,8 +326,8 @@ define @mls_nxv8i16( %a, define @mla_nxv8i16( %a, %b, %c, %d, %e) { ; CHECK-LABEL: mla_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mul z1.h, z2.h, z1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mla z1.h, p0/m, z4.h, z3.h ; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll index a080a7403811f..325ab444205bf 100644 --- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -20,111 +20,111 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: ldr d6, [x10, x8] ; CHECK-NEXT: ldr d5, [x11] ; CHECK-NEXT: ldr d7, [x11, x9] -; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b ; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: usubl v1.8h, v4.8b, v5.8b +; CHECK-NEXT: usubl v1.8h, v2.8b, v3.8b +; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b ; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b -; CHECK-NEXT: shll2 v4.4s, v2.8h, #16 -; CHECK-NEXT: shll2 v5.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v1.8h, #16 ; CHECK-NEXT: shll2 v6.4s, v3.8h, #16 -; CHECK-NEXT: shll2 v7.4s, v1.8h, #16 -; CHECK-NEXT: saddw v2.4s, v4.4s, v2.4h -; CHECK-NEXT: saddw v0.4s, v5.4s, v0.4h +; CHECK-NEXT: shll2 v7.4s, v2.8h, #16 +; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h ; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h -; CHECK-NEXT: saddw v1.4s, v7.4s, v1.4h +; CHECK-NEXT: saddw v2.4s, v7.4s, v2.4h +; CHECK-NEXT: zip1 v4.4s, v1.4s, v0.4s +; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s ; CHECK-NEXT: mov v7.16b, v2.16b -; CHECK-NEXT: zip1 v4.4s, v2.4s, v0.4s -; CHECK-NEXT: zip2 v6.4s, v2.4s, v0.4s -; CHECK-NEXT: uzp2 v5.4s, v3.4s, v1.4s -; CHECK-NEXT: mov v17.16b, v1.16b -; CHECK-NEXT: zip2 v16.4s, v1.4s, v3.4s -; CHECK-NEXT: mov v7.s[3], v0.s[2] -; CHECK-NEXT: ext v18.16b, v3.16b, v3.16b, #12 -; CHECK-NEXT: ext v2.16b, v2.16b, v4.16b, #8 -; CHECK-NEXT: mov v17.s[1], v3.s[0] +; CHECK-NEXT: ext v17.16b, v3.16b, v3.16b, #12 +; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s +; CHECK-NEXT: ext v16.16b, v1.16b, v4.16b, #8 +; CHECK-NEXT: mov v1.s[3], v0.s[2] +; CHECK-NEXT: mov v7.s[1], v3.s[0] ; CHECK-NEXT: uzp2 v0.4s, v5.4s, v3.4s -; CHECK-NEXT: zip2 v5.4s, v3.4s, v1.4s -; CHECK-NEXT: mov v3.s[0], v1.s[1] -; CHECK-NEXT: ext v1.16b, v1.16b, v18.16b, #12 -; CHECK-NEXT: mov v16.d[1], v7.d[1] -; CHECK-NEXT: mov v17.d[1], v2.d[1] +; CHECK-NEXT: zip2 v5.4s, v2.4s, v3.4s +; CHECK-NEXT: mov v3.s[0], v2.s[1] +; CHECK-NEXT: ext v2.16b, v2.16b, v17.16b, #12 +; CHECK-NEXT: mov v18.d[1], v1.d[1] +; CHECK-NEXT: mov v7.d[1], v16.d[1] ; CHECK-NEXT: mov v0.d[1], v6.d[1] -; CHECK-NEXT: mov v5.d[1], v7.d[1] ; CHECK-NEXT: mov v3.d[1], v4.d[1] -; CHECK-NEXT: mov v1.d[1], v6.d[1] -; CHECK-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-NEXT: add v2.4s, v3.4s, v17.4s -; CHECK-NEXT: sub v3.4s, v17.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v16.4s, v1.4s +; CHECK-NEXT: mov v5.d[1], v1.d[1] +; CHECK-NEXT: mov v2.d[1], v6.d[1] +; CHECK-NEXT: add v0.4s, v0.4s, v18.4s +; CHECK-NEXT: add v1.4s, v3.4s, v7.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v3.4s +; CHECK-NEXT: sub v2.4s, v5.4s, v2.4s ; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: add v6.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: rev64 v6.4s, v1.4s +; CHECK-NEXT: sub v5.4s, v3.4s, v2.4s +; CHECK-NEXT: add v2.4s, v2.4s, v3.4s ; CHECK-NEXT: mov v4.d[1], v0.d[1] -; CHECK-NEXT: mov v5.d[1], v2.d[1] -; CHECK-NEXT: rev64 v3.4s, v1.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s -; CHECK-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-NEXT: rev64 v4.4s, v6.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: rev64 v7.4s, v0.4s -; CHECK-NEXT: addp v16.4s, v0.4s, v6.4s -; CHECK-NEXT: addp v17.4s, v2.4s, v1.4s -; CHECK-NEXT: sub v4.4s, v6.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s -; CHECK-NEXT: zip1 v21.4s, v16.4s, v16.4s -; CHECK-NEXT: ext v5.16b, v17.16b, v1.16b, #4 -; CHECK-NEXT: ext v6.16b, v16.16b, v4.16b, #4 -; CHECK-NEXT: mov v18.16b, v1.16b -; CHECK-NEXT: mov v19.16b, v4.16b -; CHECK-NEXT: ext v3.16b, v2.16b, v17.16b, #8 -; CHECK-NEXT: ext v7.16b, v0.16b, v16.16b, #4 -; CHECK-NEXT: mov v18.s[2], v17.s[3] -; CHECK-NEXT: zip2 v5.4s, v5.4s, v17.4s -; CHECK-NEXT: zip2 v6.4s, v6.4s, v16.4s -; CHECK-NEXT: mov v19.s[2], v16.s[3] -; CHECK-NEXT: trn2 v0.4s, v21.4s, v0.4s -; CHECK-NEXT: ext v20.16b, v3.16b, v2.16b, #4 -; CHECK-NEXT: ext v7.16b, v7.16b, v7.16b, #4 -; CHECK-NEXT: mov v2.s[2], v17.s[1] -; CHECK-NEXT: ext v1.16b, v1.16b, v5.16b, #12 -; CHECK-NEXT: ext v4.16b, v4.16b, v6.16b, #12 -; CHECK-NEXT: mov v5.16b, v18.16b -; CHECK-NEXT: uzp2 v3.4s, v3.4s, v20.4s -; CHECK-NEXT: mov v6.16b, v7.16b -; CHECK-NEXT: mov v20.16b, v19.16b -; CHECK-NEXT: mov v21.16b, v2.16b -; CHECK-NEXT: mov v5.s[1], v17.s[2] -; CHECK-NEXT: sub v7.4s, v0.4s, v7.4s -; CHECK-NEXT: mov v6.s[0], v16.s[1] -; CHECK-NEXT: mov v20.s[1], v16.s[2] -; CHECK-NEXT: sub v16.4s, v19.4s, v4.4s -; CHECK-NEXT: mov v21.s[1], v17.s[0] -; CHECK-NEXT: sub v2.4s, v2.4s, v3.4s -; CHECK-NEXT: sub v17.4s, v18.4s, v1.4s -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: mov v6.d[1], v1.d[1] +; CHECK-NEXT: rev64 v3.4s, v5.4s +; CHECK-NEXT: rev64 v7.4s, v2.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s ; CHECK-NEXT: add v0.4s, v0.4s, v6.4s -; CHECK-NEXT: add v4.4s, v20.4s, v4.4s -; CHECK-NEXT: add v3.4s, v21.4s, v3.4s -; CHECK-NEXT: mov v1.d[1], v17.d[1] -; CHECK-NEXT: mov v0.d[1], v7.d[1] -; CHECK-NEXT: mov v4.d[1], v16.d[1] -; CHECK-NEXT: mov v3.d[1], v2.d[1] +; CHECK-NEXT: sub v3.4s, v5.4s, v3.4s +; CHECK-NEXT: addp v4.4s, v1.4s, v5.4s +; CHECK-NEXT: sub v5.4s, v2.4s, v7.4s +; CHECK-NEXT: addp v2.4s, v0.4s, v2.4s +; CHECK-NEXT: rev64 v6.4s, v0.4s +; CHECK-NEXT: rev64 v7.4s, v1.4s +; CHECK-NEXT: ext v16.16b, v4.16b, v3.16b, #4 +; CHECK-NEXT: ext v17.16b, v2.16b, v5.16b, #4 +; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v7.4s +; CHECK-NEXT: mov v7.16b, v3.16b +; CHECK-NEXT: zip2 v6.4s, v16.4s, v4.4s +; CHECK-NEXT: mov v16.16b, v5.16b +; CHECK-NEXT: zip2 v17.4s, v17.4s, v2.4s +; CHECK-NEXT: ext v18.16b, v0.16b, v2.16b, #4 +; CHECK-NEXT: mov v7.s[2], v4.s[3] +; CHECK-NEXT: mov v21.16b, v1.16b +; CHECK-NEXT: mov v16.s[2], v2.s[3] +; CHECK-NEXT: ext v5.16b, v5.16b, v17.16b, #12 +; CHECK-NEXT: zip1 v17.4s, v2.4s, v2.4s +; CHECK-NEXT: ext v3.16b, v3.16b, v6.16b, #12 +; CHECK-NEXT: ext v18.16b, v18.16b, v18.16b, #4 +; CHECK-NEXT: mov v19.16b, v7.16b +; CHECK-NEXT: ext v6.16b, v1.16b, v4.16b, #8 +; CHECK-NEXT: mov v21.s[2], v4.s[1] +; CHECK-NEXT: mov v20.16b, v16.16b +; CHECK-NEXT: mov v19.s[1], v4.s[2] +; CHECK-NEXT: trn2 v0.4s, v17.4s, v0.4s +; CHECK-NEXT: sub v16.4s, v16.4s, v5.4s +; CHECK-NEXT: mov v17.16b, v18.16b +; CHECK-NEXT: ext v1.16b, v6.16b, v1.16b, #4 +; CHECK-NEXT: sub v7.4s, v7.4s, v3.4s +; CHECK-NEXT: mov v20.s[1], v2.s[2] +; CHECK-NEXT: mov v17.s[0], v2.s[1] +; CHECK-NEXT: mov v2.16b, v21.16b +; CHECK-NEXT: add v3.4s, v19.4s, v3.4s +; CHECK-NEXT: uzp2 v1.4s, v6.4s, v1.4s +; CHECK-NEXT: add v5.4s, v20.4s, v5.4s +; CHECK-NEXT: mov v2.s[1], v4.s[0] +; CHECK-NEXT: sub v4.4s, v0.4s, v18.4s +; CHECK-NEXT: mov v3.d[1], v7.d[1] +; CHECK-NEXT: add v0.4s, v0.4s, v17.4s +; CHECK-NEXT: mov v5.d[1], v16.d[1] +; CHECK-NEXT: sub v6.4s, v21.4s, v1.4s +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: mov v0.d[1], v4.d[1] +; CHECK-NEXT: cmlt v4.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v2.8h, v5.8h, #0 +; CHECK-NEXT: mov v1.d[1], v6.d[1] +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: cmlt v6.8h, v0.8h, #0 +; CHECK-NEXT: add v5.4s, v2.4s, v5.4s +; CHECK-NEXT: eor v3.16b, v3.16b, v4.16b ; CHECK-NEXT: cmlt v7.8h, v1.8h, #0 -; CHECK-NEXT: cmlt v2.8h, v0.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v4.8h, #0 -; CHECK-NEXT: cmlt v5.8h, v3.8h, #0 +; CHECK-NEXT: add v0.4s, v6.4s, v0.4s +; CHECK-NEXT: eor v2.16b, v5.16b, v2.16b ; CHECK-NEXT: add v1.4s, v7.4s, v1.4s -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-NEXT: add v4.4s, v6.4s, v4.4s -; CHECK-NEXT: add v3.4s, v5.4s, v3.4s +; CHECK-NEXT: eor v0.16b, v0.16b, v6.16b +; CHECK-NEXT: add v2.4s, v2.4s, v3.4s ; CHECK-NEXT: eor v1.16b, v1.16b, v7.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v2.16b -; CHECK-NEXT: eor v2.16b, v3.16b, v5.16b -; CHECK-NEXT: eor v3.16b, v4.16b, v6.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s @@ -278,13 +278,13 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: mov v1.d[1], v6.d[1] ; CHECK-NEXT: add v2.4s, v2.4s, v16.4s ; CHECK-NEXT: add v3.4s, v4.4s, v17.4s +; CHECK-NEXT: rev64 v5.4s, v2.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: sub v1.4s, v17.4s, v4.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s ; CHECK-NEXT: rev64 v6.4s, v3.4s +; CHECK-NEXT: mov v5.d[1], v2.d[1] ; CHECK-NEXT: sub v4.4s, v1.4s, v0.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mov v5.d[1], v2.d[1] ; CHECK-NEXT: mov v6.d[1], v3.d[1] ; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s ; CHECK-NEXT: add v1.4s, v2.4s, v6.4s @@ -304,40 +304,40 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: mov v4.d[1], v16.d[1] ; CHECK-NEXT: mov v1.d[1], v7.d[1] ; CHECK-NEXT: add v0.4s, v17.4s, v1.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v17.4s ; CHECK-NEXT: add v2.4s, v18.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v17.4s ; CHECK-NEXT: sub v3.4s, v4.4s, v18.4s -; CHECK-NEXT: zip2 v4.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v5.16b, v0.16b, v0.16b, #4 -; CHECK-NEXT: ext v6.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: zip2 v6.4s, v0.4s, v1.4s ; CHECK-NEXT: zip2 v7.4s, v1.4s, v0.4s ; CHECK-NEXT: zip2 v16.4s, v3.4s, v2.4s ; CHECK-NEXT: zip2 v17.4s, v2.4s, v3.4s ; CHECK-NEXT: zip1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: zip1 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: ext v1.16b, v5.16b, v1.16b, #8 -; CHECK-NEXT: ext v18.16b, v6.16b, v3.16b, #8 -; CHECK-NEXT: add v3.4s, v16.4s, v7.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ext v1.16b, v1.16b, v5.16b, #4 -; CHECK-NEXT: ext v5.16b, v18.16b, v6.16b, #4 -; CHECK-NEXT: cmlt v2.8h, v4.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v4.4s, v2.4s, v4.4s -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: ext v18.16b, v4.16b, v1.16b, #8 +; CHECK-NEXT: ext v19.16b, v5.16b, v3.16b, #8 +; CHECK-NEXT: zip1 v1.4s, v2.4s, v3.4s +; CHECK-NEXT: add v2.4s, v16.4s, v7.4s +; CHECK-NEXT: sub v3.4s, v6.4s, v17.4s +; CHECK-NEXT: ext v4.16b, v18.16b, v4.16b, #4 +; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #4 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v2.8h, #0 +; CHECK-NEXT: add v4.4s, v5.4s, v4.4s ; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 +; CHECK-NEXT: add v2.4s, v6.4s, v2.4s +; CHECK-NEXT: add v3.4s, v1.4s, v3.4s +; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: cmlt v4.8h, v1.8h, #0 -; CHECK-NEXT: add v2.4s, v3.4s, v2.4s -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: add v3.4s, v7.4s, v4.4s ; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: eor v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: eor v2.16b, v3.16b, v7.16b +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 @@ -461,93 +461,93 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: ldr d3, [x11, x9] ; CHECK-NEXT: ldr d4, [x10] ; CHECK-NEXT: ldr d5, [x11] +; CHECK-NEXT: shll2 v6.4s, v0.8h, #16 ; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b -; CHECK-NEXT: usubl v3.8h, v4.8b, v5.8b -; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v5.4s, v1.8h, #16 -; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h -; CHECK-NEXT: shll2 v4.4s, v2.8h, #16 -; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h -; CHECK-NEXT: shll2 v5.4s, v3.8h, #16 -; CHECK-NEXT: saddw v2.4s, v4.4s, v2.4h -; CHECK-NEXT: saddw v3.4s, v5.4s, v3.4h -; CHECK-NEXT: rev64 v4.4s, v0.4s +; CHECK-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-NEXT: usubl v4.8h, v4.8b, v5.8b +; CHECK-NEXT: saddw v0.4s, v6.4s, v0.4h +; CHECK-NEXT: shll2 v5.4s, v2.8h, #16 +; CHECK-NEXT: saddw v1.4s, v3.4s, v1.4h +; CHECK-NEXT: shll2 v3.4s, v4.8h, #16 +; CHECK-NEXT: rev64 v6.4s, v0.4s +; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h ; CHECK-NEXT: rev64 v5.4s, v1.4s -; CHECK-NEXT: rev64 v6.4s, v2.4s -; CHECK-NEXT: rev64 v7.4s, v3.4s -; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s +; CHECK-NEXT: saddw v3.4s, v3.4s, v4.4h +; CHECK-NEXT: rev64 v4.4s, v2.4s +; CHECK-NEXT: sub v6.4s, v0.4s, v6.4s ; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s +; CHECK-NEXT: rev64 v7.4s, v3.4s ; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s -; CHECK-NEXT: sub v6.4s, v2.4s, v6.4s +; CHECK-NEXT: sub v4.4s, v2.4s, v4.4s ; CHECK-NEXT: addp v2.4s, v2.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v3.4s, v7.4s +; CHECK-NEXT: ext v1.16b, v5.16b, v6.16b, #4 +; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v7.16b, v5.16b, v4.16b, #4 -; CHECK-NEXT: mov v4.s[3], v5.s[2] -; CHECK-NEXT: zip2 v16.4s, v6.4s, v1.4s -; CHECK-NEXT: zip1 v1.4s, v6.4s, v1.4s -; CHECK-NEXT: uzp2 v6.4s, v2.4s, v0.4s -; CHECK-NEXT: ext v5.16b, v7.16b, v5.16b, #4 +; CHECK-NEXT: mov v6.s[3], v5.s[2] +; CHECK-NEXT: zip2 v16.4s, v4.4s, v7.4s +; CHECK-NEXT: zip1 v4.4s, v4.4s, v7.4s +; CHECK-NEXT: ext v1.16b, v1.16b, v5.16b, #4 +; CHECK-NEXT: uzp2 v5.4s, v2.4s, v0.4s ; CHECK-NEXT: uzp1 v0.4s, v2.4s, v0.4s ; CHECK-NEXT: uzp1 v7.4s, v2.4s, v3.4s ; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: mov v16.d[1], v4.d[1] -; CHECK-NEXT: rev64 v3.4s, v6.4s -; CHECK-NEXT: mov v1.d[1], v5.d[1] +; CHECK-NEXT: mov v16.d[1], v6.d[1] +; CHECK-NEXT: mov v4.d[1], v1.d[1] +; CHECK-NEXT: rev64 v1.4s, v5.4s ; CHECK-NEXT: rev64 v0.4s, v0.4s ; CHECK-NEXT: sub v2.4s, v7.4s, v2.4s -; CHECK-NEXT: sub v4.4s, v1.4s, v16.4s -; CHECK-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-NEXT: add v1.4s, v16.4s, v1.4s -; CHECK-NEXT: zip1 v3.4s, v2.4s, v4.4s +; CHECK-NEXT: sub v3.4s, v4.4s, v16.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v16.4s, v4.4s +; CHECK-NEXT: zip1 v4.4s, v2.4s, v3.4s ; CHECK-NEXT: zip1 v5.4s, v0.4s, v1.4s ; CHECK-NEXT: uzp2 v6.4s, v0.4s, v1.4s ; CHECK-NEXT: zip2 v7.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v17.4s, v2.4s, v4.4s -; CHECK-NEXT: ext v16.16b, v2.16b, v3.16b, #8 +; CHECK-NEXT: zip2 v17.4s, v2.4s, v3.4s +; CHECK-NEXT: ext v16.16b, v2.16b, v4.16b, #8 ; CHECK-NEXT: trn2 v5.4s, v0.4s, v5.4s ; CHECK-NEXT: uzp2 v6.4s, v6.4s, v0.4s -; CHECK-NEXT: mov v2.s[3], v4.s[2] +; CHECK-NEXT: mov v2.s[3], v3.s[2] ; CHECK-NEXT: mov v0.s[1], v1.s[1] ; CHECK-NEXT: mov v5.d[1], v16.d[1] ; CHECK-NEXT: mov v6.d[1], v17.d[1] ; CHECK-NEXT: mov v7.d[1], v2.d[1] -; CHECK-NEXT: mov v0.d[1], v3.d[1] +; CHECK-NEXT: mov v0.d[1], v4.d[1] ; CHECK-NEXT: add v1.4s, v6.4s, v7.4s -; CHECK-NEXT: sub v2.4s, v7.4s, v6.4s -; CHECK-NEXT: add v3.4s, v5.4s, v0.4s +; CHECK-NEXT: add v2.4s, v5.4s, v0.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v6.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s -; CHECK-NEXT: zip2 v4.4s, v1.4s, v2.4s -; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #4 -; CHECK-NEXT: ext v6.16b, v3.16b, v3.16b, #4 -; CHECK-NEXT: zip2 v7.4s, v2.4s, v1.4s -; CHECK-NEXT: zip2 v16.4s, v0.4s, v3.4s -; CHECK-NEXT: zip2 v17.4s, v3.4s, v0.4s -; CHECK-NEXT: zip1 v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ext v2.16b, v5.16b, v2.16b, #8 -; CHECK-NEXT: ext v18.16b, v6.16b, v0.16b, #8 -; CHECK-NEXT: zip1 v0.4s, v3.4s, v0.4s -; CHECK-NEXT: add v3.4s, v16.4s, v7.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s -; CHECK-NEXT: ext v2.16b, v2.16b, v5.16b, #4 -; CHECK-NEXT: ext v5.16b, v18.16b, v6.16b, #4 +; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: zip2 v6.4s, v1.4s, v3.4s +; CHECK-NEXT: zip2 v7.4s, v3.4s, v1.4s +; CHECK-NEXT: zip2 v16.4s, v0.4s, v2.4s +; CHECK-NEXT: zip2 v17.4s, v2.4s, v0.4s +; CHECK-NEXT: zip1 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ext v18.16b, v4.16b, v3.16b, #8 +; CHECK-NEXT: ext v19.16b, v5.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v0.4s, v2.4s, v0.4s +; CHECK-NEXT: add v2.4s, v16.4s, v7.4s +; CHECK-NEXT: sub v3.4s, v6.4s, v17.4s +; CHECK-NEXT: ext v4.16b, v18.16b, v4.16b, #4 +; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #4 ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s -; CHECK-NEXT: cmlt v1.8h, v4.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v4.4s, v1.4s, v4.4s -; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v2.8h, #0 +; CHECK-NEXT: add v4.4s, v5.4s, v4.4s ; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 +; CHECK-NEXT: add v2.4s, v6.4s, v2.4s +; CHECK-NEXT: add v3.4s, v1.4s, v3.4s +; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: cmlt v4.8h, v2.8h, #0 -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: add v2.4s, v4.4s, v2.4s +; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: add v3.4s, v7.4s, v4.4s ; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: eor v2.16b, v3.16b, v7.16b ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: eor v1.16b, v2.16b, v4.16b -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll index 86c224bee990a..2deb19be24821 100644 --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -346,9 +346,9 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_min(<16 x i8> %x) { ; CHECK-LABEL: unsigned_sat_constant_v16i8_using_min: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.16b, #213 +; CHECK-NEXT: movi v2.16b, #42 ; CHECK-NEXT: umin v0.16b, v0.16b, v1.16b -; CHECK-NEXT: movi v1.16b, #42 -; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: add v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %c = icmp ult <16 x i8> %x, %s = select <16 x i1> %c, <16 x i8> %x, <16 x i8> @@ -384,9 +384,9 @@ define <8 x i16> @unsigned_sat_constant_v8i16_using_min(<8 x i16> %x) { ; CHECK-LABEL: unsigned_sat_constant_v8i16_using_min: ; CHECK: // %bb.0: ; CHECK-NEXT: mvni v1.8h, #42 +; CHECK-NEXT: movi v2.8h, #42 ; CHECK-NEXT: umin v0.8h, v0.8h, v1.8h -; CHECK-NEXT: movi v1.8h, #42 -; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: add v0.8h, v0.8h, v2.8h ; CHECK-NEXT: ret %c = icmp ult <8 x i16> %x, %s = select <8 x i1> %c, <8 x i16> %x, <8 x i16> diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index 3e0d5dd875097..5237a3491de9b 100644 --- a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -245,15 +245,15 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) { ; CHECK-GI-LABEL: sext_v3i8_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #24 // =0x18 -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: mov v1.s[1], w1 -; CHECK-GI-NEXT: mov v0.s[1], w8 -; CHECK-GI-NEXT: mov v1.s[2], w2 -; CHECK-GI-NEXT: mov v0.s[2], w8 -; CHECK-GI-NEXT: neg v2.4s, v0.4s -; CHECK-GI-NEXT: ushl v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v0.s[2], w2 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i8> %a to <3 x i32> @@ -408,15 +408,15 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) { ; CHECK-GI-LABEL: sext_v3i10_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #22 // =0x16 -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: mov v1.s[1], w1 -; CHECK-GI-NEXT: mov v0.s[1], w8 -; CHECK-GI-NEXT: mov v1.s[2], w2 -; CHECK-GI-NEXT: mov v0.s[2], w8 -; CHECK-GI-NEXT: neg v2.4s, v0.4s -; CHECK-GI-NEXT: ushl v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v0.s[2], w2 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i10> %a to <3 x i32> diff --git a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll index f4f75bb9c7825..88e062d2c999c 100644 --- a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll +++ b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll @@ -158,8 +158,8 @@ define i32 @sink_sub_from_const_to_sub2(i32 %a, i32 %b) { define <4 x i32> @vec_sink_add_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_add_of_const_to_add0: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -170,8 +170,8 @@ define <4 x i32> @vec_sink_add_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_add_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_add_of_const_to_add1: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -186,8 +186,8 @@ define <4 x i32> @vec_sink_add_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_of_const_to_add0: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -198,8 +198,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_of_const_to_add1: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI15_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -214,8 +214,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_from_const_to_add0(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_from_const_to_add0: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: adrp x8, .LCPI16_0 +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -226,8 +226,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_add0(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_from_const_to_add1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_from_const_to_add1: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: adrp x8, .LCPI17_0 +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -242,8 +242,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_add1(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_add_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_add_of_const_to_sub: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -254,8 +254,8 @@ define <4 x i32> @vec_sink_add_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_add_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_add_of_const_to_sub2: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: adrp x8, .LCPI19_0 +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -270,8 +270,8 @@ define <4 x i32> @vec_sink_add_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_of_const_to_sub: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI20_0 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -282,8 +282,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_of_const_to_sub2: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: adrp x8, .LCPI21_0 +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -298,8 +298,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_from_const_to_sub(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_from_const_to_sub: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI22_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret @@ -310,8 +310,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_sub(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_from_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_from_const_to_sub2: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI23_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll index 0c674c5685e00..1d1bae42c9e30 100644 --- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -28,8 +28,8 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload @@ -61,8 +61,8 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload @@ -94,8 +94,8 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload @@ -127,8 +127,8 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload @@ -165,8 +165,8 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -205,8 +205,8 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -245,8 +245,8 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -290,8 +290,8 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -331,8 +331,8 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -372,8 +372,8 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -413,8 +413,8 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -454,8 +454,8 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -495,8 +495,8 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -536,8 +536,8 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -581,8 +581,8 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -621,8 +621,8 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -661,8 +661,8 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -701,8 +701,8 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -741,8 +741,8 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -781,8 +781,8 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8bf16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -821,8 +821,8 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -861,8 +861,8 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -894,9 +894,9 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -942,13 +942,13 @@ define void @dont_coalesce_res_i8(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i8 @get_i8() @@ -969,13 +969,13 @@ define void @dont_coalesce_res_i16(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i16 @get_i16() @@ -996,13 +996,13 @@ define void @dont_coalesce_res_i32(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i32 @get_i32() @@ -1023,13 +1023,13 @@ define void @dont_coalesce_res_i64(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i64 @get_i64() @@ -1056,11 +1056,11 @@ define void @dont_coalesce_res_f16(ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call half @get_f16() @@ -1086,11 +1086,11 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call float @get_f32() @@ -1116,11 +1116,11 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call double @get_f64() @@ -1150,11 +1150,11 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i8> @get_v1i8() @@ -1181,11 +1181,11 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i16> @get_v1i16() @@ -1212,11 +1212,11 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i32> @get_v1i32() @@ -1243,11 +1243,11 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i64> @get_v1i64() @@ -1275,11 +1275,11 @@ define void @dont_coalesce_res_v1f16(ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x half> @get_v1f16() @@ -1306,11 +1306,11 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x float> @get_v1f32() @@ -1337,11 +1337,11 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x double> @get_v1f64() @@ -1373,11 +1373,11 @@ define void @dont_coalesce_res_v16i8(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <16 x i8> @get_v16i8() @@ -1404,11 +1404,11 @@ define void @dont_coalesce_res_v8i16(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <8 x i16> @get_v8i16() @@ -1435,11 +1435,11 @@ define void @dont_coalesce_res_v4i32(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <4 x i32> @get_v4i32() @@ -1466,11 +1466,11 @@ define void @dont_coalesce_res_v2i64(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <2 x i64> @get_v2i64() @@ -1497,11 +1497,11 @@ define void @dont_coalesce_res_v8f16(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <8 x half> @get_v8f16() @@ -1528,11 +1528,11 @@ define void @dont_coalesce_res_v4f32(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <4 x float> @get_v4f32() @@ -1559,11 +1559,11 @@ define void @dont_coalesce_res_v2f64(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <2 x double> @get_v2f64() diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 47b24290d3c85..1e16f140676ba 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -151,8 +151,8 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB4_4: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll index 0097968b1171d..b4fd5a2272e7e 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll @@ -26,16 +26,16 @@ define void @fdot_multi_za32_f16_vg1x2(i32 %slice, %unused, < define void @fdot_multi_za32_f16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: fdot_multi_za32_f16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: fdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: fdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -71,16 +71,16 @@ define void @bfdot_multi_za32_bf16_vg1x2(i32 %slice, %unused, define void @fdot_multi_za32_bf16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: fdot_multi_za32_bf16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: bfdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: bfdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll index 6d98604837115..e154a4df86efe 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll @@ -26,16 +26,16 @@ define void @udot_multi_za32_u16_vg1x2(i32 %slice, %unused, < define void @udot_multi_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -68,16 +68,16 @@ define void @udot_multi_za32_u8_vg1x2(i32 %slice, %unused, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -110,16 +110,16 @@ define void @udot_multi_za64_u16_vg1x2(i32 %slice, %unused, < define void @udot_multi_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -152,16 +152,16 @@ define void @usdot_multi_za32_u8_vg1x2(i32 %slice, %unused, < define void @usdot_multi_za32_u8_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: usdot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -197,16 +197,16 @@ define void @sdot_multi_za32_u16_vg1x2(i32 %slice, %unused, < define void @sdot_multi_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -239,16 +239,16 @@ define void @sdot_multi_za32_u8_vg1x2(i32 %slice, %unused, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -281,16 +281,16 @@ define void @sdot_multi_za64_u16_vg1x2(i32 %slice, %unused, < define void @sdot_multi_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll index e95d29f65e55e..92e8877927ea5 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s ; SMAX (Single, x2) @@ -151,8 +152,7 @@ define { , } @multi_vec_max_single_x2 ; SMAX (Single, x4) -define { , , , } -@multi_vec_max_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_s8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -170,8 +170,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_s16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -189,8 +188,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_s32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -208,8 +206,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_s64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -229,8 +226,7 @@ define { , , , , , , } -@multi_vec_max_single_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_u8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -248,8 +244,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_u16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -267,8 +262,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_u32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -286,8 +280,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_u64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -307,8 +300,7 @@ define { , , , , , , } -@multi_vec_max_single_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -326,8 +318,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -345,8 +336,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -537,104 +527,100 @@ define { , } @multi_vec_max_multi_x2_ ; SMAX (Multi, x4) -define { , , , } -@multi_vec_max_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_s8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smax { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smax.x4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_s16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smax.x4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_s32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smax.x4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_s64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smax.x4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -643,104 +629,100 @@ define { , , , , , , } -@multi_vec_max_multi_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_u8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umax { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umax.x4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_u16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umax.x4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_u32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umax.x4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_u64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umax.x4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -749,78 +731,75 @@ define { , , , , , , } -@multi_vec_max_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_f16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmax.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_f32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmax.x4.nxv4f32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_f64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmax.x4.nxv2f64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -870,8 +849,7 @@ define { , } @multi_vec_maxnm_single ; FMAXNM (Single, x4) -define { , , , } -@multi_vec_maxnm_single_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_maxnm_single_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_maxnm_single_x4_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -889,8 +867,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_maxnm_single_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_maxnm_single_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_maxnm_single_x4_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -908,8 +885,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_maxnm_single_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_maxnm_single_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_maxnm_single_x4_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -976,19 +952,18 @@ define { , } @multi_vec_maxnm_x2_f64( ; FMAXNM (Multi, x4) -define { , , , } -@multi_vec_maxnm_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_maxnm_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_maxnm_x4_f16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmaxnm { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -996,24 +971,23 @@ define { , , , , , , } - @llvm.aarch64.sve.fmaxnm.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, + @llvm.aarch64.sve.fmaxnm.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_maxnm_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_maxnm_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_maxnm_x4_f32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmaxnm { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -1026,19 +1000,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_maxnm_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_maxnm_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_maxnm_x4_f64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmaxnm { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll index 21a55c6436acd..363f9ba5d3530 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s ; SMIN (Single, x2) @@ -151,8 +152,7 @@ define { , } @multi_vec_min_single_x2 ; SMIN (Single, x4) -define { , , , } -@multi_vec_min_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_s8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -170,8 +170,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_s16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -189,8 +188,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_s32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -208,8 +206,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_s64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -229,8 +226,7 @@ define { , , , , , , } -@multi_vec_min_single_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_u8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -248,8 +244,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_u16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -267,8 +262,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_u32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -286,8 +280,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_u64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -307,8 +300,7 @@ define { , , , , , , } -@multi_vec_min_single_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -326,8 +318,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -345,8 +336,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -537,104 +527,100 @@ define { , } @multi_vec_min_multi_x2_ ; SMIN (Multi, x4) -define { , , , } -@multi_vec_min_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_s8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smin { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smin.x4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_s16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smin.x4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_s32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smin.x4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_s64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smin.x4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -643,104 +629,100 @@ define { , , , , , , } -@multi_vec_min_multi_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_u8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umin { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umin.x4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_u16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umin.x4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_u32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umin.x4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_u64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umin.x4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -749,78 +731,75 @@ define { , , , , , , } -@multi_vec_min_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmin.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmin.x4.nxv4f32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmin.x4.nxv2f64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -870,8 +849,7 @@ define { , } @multi_vec_minnm_single ; FMINNM (Single, x4) -define { , , , } -@multi_vec_minnm_single_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_minnm_single_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_minnm_single_x4_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -889,8 +867,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_minnm_single_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_minnm_single_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_minnm_single_x4_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -908,8 +885,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_minnm_single_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_minnm_single_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_minnm_single_x4_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -976,19 +952,18 @@ define { , } @multi_vec_minnm_x2_f64( ; FMINNM (Multi, x4) -define { , , , } -@multi_vec_minnm_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_minnm_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_minnm_x4_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fminnm { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -1001,19 +976,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_minnm_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_minnm_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_minnm_x4_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fminnm { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -1026,19 +1000,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_minnm_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_minnm_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_minnm_x4_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fminnm { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll index f766bfcff4d1d..346afc611eb75 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll @@ -142,16 +142,16 @@ define void @multi_vector_mul_add_multi_long_vg4x2_s16(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: smlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: smlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -164,16 +164,16 @@ define void @multi_vector_mul_add_multi_long_vg4x4_s8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: smlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: smlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -418,16 +418,16 @@ define void @multi_vector_mul_add_multi_long_vg4x2_u16(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: umlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: umlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -440,16 +440,16 @@ define void @multi_vector_mul_add_multi_long_vg4x4_u8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: umlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: umlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -694,16 +694,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x2_s16(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -716,16 +716,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x4_s8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -970,16 +970,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x2_u16(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: umlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: umlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -992,16 +992,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x4_u8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: umlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: umlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -1275,16 +1275,16 @@ define void @multi_vector_mul_add_multi_unsigned_long_vg4x2_u8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x4_u8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: usmlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: usmlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll index d138a3af43852..12a940ff03e29 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s ; SRSHL (Single, x2) @@ -56,8 +57,7 @@ define { , } @multi_vec_rounding_shl_single ; SRSHL (Single, x4) -define { , , , } -@multi_vec_rounding_shl_single_x4_s8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_s8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -75,8 +75,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_s16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_s16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -94,8 +93,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_s32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_s32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -113,8 +111,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_s64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_s64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -188,8 +185,7 @@ define { , } @multi_vec_rounding_shl_single ; URSHL (Single, x4) -define { , , , } -@multi_vec_rounding_shl_single_x4_u8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_u8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -207,8 +203,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_u16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_u16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -226,8 +221,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_u32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_u32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -245,8 +239,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_u64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_u64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -328,19 +321,18 @@ define { , } @multi_vec_rounding_shl_x2_s64 ; SRSHL (Multi, x4) -define { , , , } -@multi_vec_rounding_shl_x4_s8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_s8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_s8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: srshl { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -353,19 +345,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_s16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_s16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_s16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: srshl { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -378,19 +369,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_s32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_s32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_s32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: srshl { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -403,19 +393,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_s64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_s64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_s64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: srshl { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -492,19 +481,18 @@ define { , } @multi_vec_rounding_uhl_x2_u64 ; URSHL (Multi, x4) -define { , , , } -@multi_vec_rounding_shl_x4_u8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_u8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_u8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: urshl { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -517,19 +505,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_u16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_u16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_u16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: urshl { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -542,19 +529,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_u32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_u32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_u32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: urshl { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -567,19 +553,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_u64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_u64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_u64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: urshl { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll index 9c5dff6c3bf6f..e71afe213d8a5 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s ; SQDMULH (Single, x2) @@ -56,8 +57,7 @@ define { , } @multi_vec_sat_double_mulh_sin ; SQDMULH (Single, x4) -define { , , , } -@multi_vec_sat_double_mulh_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_sat_double_mulh_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_sat_double_mulh_single_x4_s8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -75,8 +75,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_sat_double_mulh_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_sat_double_mulh_single_x4_s16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -94,8 +93,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_sat_double_mulh_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_sat_double_mulh_single_x4_s32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -113,8 +111,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_sat_double_mulh_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_sat_double_mulh_single_x4_s64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -196,104 +193,100 @@ define { , } @multi_vec_sat_double_mulh_mul ; SQDMULH (x4, Multi) -define { , , , } -@multi_vec_sat_double_mulh_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_sat_double_mulh_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: sqdmulh { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_sat_double_mulh_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: sqdmulh { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_sat_double_mulh_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: sqdmulh { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_sat_double_mulh_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: sqdmulh { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) diff --git a/llvm/test/CodeGen/AArch64/split-vector-insert.ll b/llvm/test/CodeGen/AArch64/split-vector-insert.ll index a507296338f93..b8ab9a00c6981 100644 --- a/llvm/test/CodeGen/AArch64/split-vector-insert.ll +++ b/llvm/test/CodeGen/AArch64/split-vector-insert.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc < %s -debug-only=legalize-types 2>&1 | FileCheck %s --check-prefix=CHECK-LEGALIZATION ; RUN: llc < %s | FileCheck %s ; REQUIRES: asserts @@ -9,54 +10,94 @@ declare @llvm.vector.insert.nxv2i64.v8i64(, declare @llvm.vector.insert.nxv2f64.v8f64(, <8 x double>, i64) define @test_nxv2i64_v8i64( %a, <8 x i64> %b) #0 { -; CHECK-LEGALIZATION: Legally typed node: [[T1:t[0-9]+]]: nxv2i64 = insert_subvector {{t[0-9]+}}, {{t[0-9]+}}, Constant:i64<0> -; CHECK-LEGALIZATION: Legally typed node: [[T2:t[0-9]+]]: nxv2i64 = insert_subvector [[T1]], {{t[0-9]+}}, Constant:i64<2> -; CHECK-LEGALIZATION: Legally typed node: [[T3:t[0-9]+]]: nxv2i64 = insert_subvector [[T2]], {{t[0-9]+}}, Constant:i64<4> -; CHECK-LEGALIZATION: Legally typed node: [[T4:t[0-9]+]]: nxv2i64 = insert_subvector [[T3]], {{t[0-9]+}}, Constant:i64<6> - +; CHECK-LEGALIZATION-LABEL: test_nxv2i64_v8i64: +; CHECK-LEGALIZATION: // %bb.0: +; CHECK-LEGALIZATION-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa_offset 16 +; CHECK-LEGALIZATION-NEXT: .cfi_offset w29, -16 +; CHECK-LEGALIZATION-NEXT: addvl sp, sp, #-3 +; CHECK-LEGALIZATION-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-LEGALIZATION-NEXT: cntd x8 +; CHECK-LEGALIZATION-NEXT: ptrue p0.d, vl2 +; CHECK-LEGALIZATION-NEXT: mov w9, #2 // =0x2 +; CHECK-LEGALIZATION-NEXT: sub x8, x8, #2 +; CHECK-LEGALIZATION-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-LEGALIZATION-NEXT: mov x10, sp +; CHECK-LEGALIZATION-NEXT: cmp x8, #2 +; CHECK-LEGALIZATION-NEXT: mov z0.d, p0/m, z1.d +; CHECK-LEGALIZATION-NEXT: ptrue p0.d +; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: cmp x8, #4 +; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-LEGALIZATION-NEXT: str q2, [x10, x9] +; CHECK-LEGALIZATION-NEXT: mov w9, #4 // =0x4 +; CHECK-LEGALIZATION-NEXT: addvl x10, sp, #1 +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: cmp x8, #6 +; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-LEGALIZATION-NEXT: str q3, [x10, x9] +; CHECK-LEGALIZATION-NEXT: mov w9, #6 // =0x6 +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-LEGALIZATION-NEXT: csel x8, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: addvl x9, sp, #2 +; CHECK-LEGALIZATION-NEXT: lsl x8, x8, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-LEGALIZATION-NEXT: str q4, [x9, x8] +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-LEGALIZATION-NEXT: addvl sp, sp, #3 +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-LEGALIZATION-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa_offset 0 +; CHECK-LEGALIZATION-NEXT: .cfi_restore w29 +; CHECK-LEGALIZATION-NEXT: ret +; ; CHECK-LABEL: test_nxv2i64_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG -; CHECK-NEXT: ptrue p1.d, vl2 -; CHECK-NEXT: cntd x8 -; CHECK-NEXT: mov w9, #2 // =0x2 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sub x8, x8, #2 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: cmp x8, #2 -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: cmp x8, #4 -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: mov z0.d, p1/m, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: str q2, [x10, x9] -; CHECK-NEXT: mov w9, #4 // =0x4 -; CHECK-NEXT: addvl x10, sp, #1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: cmp x8, #6 -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: str q3, [x10, x9] -; CHECK-NEXT: mov w9, #6 // =0x6 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: addvl x9, sp, #2 -; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: str q4, [x9, x8] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] -; CHECK-NEXT: addvl sp, sp, #3 -; CHECK-NEXT: .cfi_def_cfa wsp, 16 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w29 -; CHECK-NEXT: ret +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov w9, #2 // =0x2 +; CHECK-NEXT: sub x8, x8, #2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: cmp x8, #2 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: cmp x8, #4 +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str q2, [x10, x9] +; CHECK-NEXT: mov w9, #4 // =0x4 +; CHECK-NEXT: addvl x10, sp, #1 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: cmp x8, #6 +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: str q3, [x10, x9] +; CHECK-NEXT: mov w9, #6 // =0x6 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: addvl x9, sp, #2 +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: str q4, [x9, x8] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + @@ -66,54 +107,94 @@ define @test_nxv2i64_v8i64( %a, <8 x i64> % } define @test_nxv2f64_v8f64( %a, <8 x double> %b) #0 { -; CHECK-LEGALIZATION: Legally typed node: [[T1:t[0-9]+]]: nxv2f64 = insert_subvector {{t[0-9]+}}, {{t[0-9]+}}, Constant:i64<0> -; CHECK-LEGALIZATION: Legally typed node: [[T2:t[0-9]+]]: nxv2f64 = insert_subvector [[T1]], {{t[0-9]+}}, Constant:i64<2> -; CHECK-LEGALIZATION: Legally typed node: [[T3:t[0-9]+]]: nxv2f64 = insert_subvector [[T2]], {{t[0-9]+}}, Constant:i64<4> -; CHECK-LEGALIZATION: Legally typed node: [[T4:t[0-9]+]]: nxv2f64 = insert_subvector [[T3]], {{t[0-9]+}}, Constant:i64<6> - +; CHECK-LEGALIZATION-LABEL: test_nxv2f64_v8f64: +; CHECK-LEGALIZATION: // %bb.0: +; CHECK-LEGALIZATION-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa_offset 16 +; CHECK-LEGALIZATION-NEXT: .cfi_offset w29, -16 +; CHECK-LEGALIZATION-NEXT: addvl sp, sp, #-3 +; CHECK-LEGALIZATION-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-LEGALIZATION-NEXT: cntd x8 +; CHECK-LEGALIZATION-NEXT: ptrue p0.d, vl2 +; CHECK-LEGALIZATION-NEXT: mov w9, #2 // =0x2 +; CHECK-LEGALIZATION-NEXT: sub x8, x8, #2 +; CHECK-LEGALIZATION-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-LEGALIZATION-NEXT: mov x10, sp +; CHECK-LEGALIZATION-NEXT: cmp x8, #2 +; CHECK-LEGALIZATION-NEXT: mov z0.d, p0/m, z1.d +; CHECK-LEGALIZATION-NEXT: ptrue p0.d +; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: cmp x8, #4 +; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-LEGALIZATION-NEXT: str q2, [x10, x9] +; CHECK-LEGALIZATION-NEXT: mov w9, #4 // =0x4 +; CHECK-LEGALIZATION-NEXT: addvl x10, sp, #1 +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: cmp x8, #6 +; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-LEGALIZATION-NEXT: str q3, [x10, x9] +; CHECK-LEGALIZATION-NEXT: mov w9, #6 // =0x6 +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-LEGALIZATION-NEXT: csel x8, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: addvl x9, sp, #2 +; CHECK-LEGALIZATION-NEXT: lsl x8, x8, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-LEGALIZATION-NEXT: str q4, [x9, x8] +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-LEGALIZATION-NEXT: addvl sp, sp, #3 +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-LEGALIZATION-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa_offset 0 +; CHECK-LEGALIZATION-NEXT: .cfi_restore w29 +; CHECK-LEGALIZATION-NEXT: ret +; ; CHECK-LABEL: test_nxv2f64_v8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG -; CHECK-NEXT: ptrue p1.d, vl2 -; CHECK-NEXT: cntd x8 -; CHECK-NEXT: mov w9, #2 // =0x2 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sub x8, x8, #2 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: cmp x8, #2 -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: cmp x8, #4 -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: mov z0.d, p1/m, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: str q2, [x10, x9] -; CHECK-NEXT: mov w9, #4 // =0x4 -; CHECK-NEXT: addvl x10, sp, #1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: cmp x8, #6 -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: str q3, [x10, x9] -; CHECK-NEXT: mov w9, #6 // =0x6 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: addvl x9, sp, #2 -; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: str q4, [x9, x8] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] -; CHECK-NEXT: addvl sp, sp, #3 -; CHECK-NEXT: .cfi_def_cfa wsp, 16 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w29 -; CHECK-NEXT: ret +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov w9, #2 // =0x2 +; CHECK-NEXT: sub x8, x8, #2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: cmp x8, #2 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: cmp x8, #4 +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str q2, [x10, x9] +; CHECK-NEXT: mov w9, #4 // =0x4 +; CHECK-NEXT: addvl x10, sp, #1 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: cmp x8, #6 +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: str q3, [x10, x9] +; CHECK-NEXT: mov w9, #6 // =0x6 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: addvl x9, sp, #2 +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: str q4, [x9, x8] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll index 1d9cb88260b60..c0c0ae5c9d1fe 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll @@ -111,6 +111,7 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_undef1: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: movi v3.4s, #25 ; CHECK-NEXT: movk w8, #20971, lsl #16 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s @@ -118,9 +119,8 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind { ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s ; CHECK-NEXT: sshr v2.4s, v1.4s, #3 ; CHECK-NEXT: usra v2.4s, v1.4s, #31 -; CHECK-NEXT: movi v1.4s, #25 -; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -134,6 +134,7 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_undef1: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: movi v3.4s, #100 ; CHECK-NEXT: movk w8, #20971, lsl #16 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s @@ -141,9 +142,8 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind { ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s ; CHECK-NEXT: sshr v2.4s, v1.4s, #5 ; CHECK-NEXT: usra v2.4s, v1.4s, #31 -; CHECK-NEXT: movi v1.4s, #100 -; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -201,11 +201,11 @@ define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind { define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_int_min: ; CHECK: // %bb.0: -; CHECK-NEXT: cmlt v1.4s, v0.4s, #0 -; CHECK-NEXT: mov v2.16b, v0.16b -; CHECK-NEXT: usra v2.4s, v1.4s, #1 +; CHECK-NEXT: cmlt v2.4s, v0.4s, #0 +; CHECK-NEXT: mov v3.16b, v0.16b ; CHECK-NEXT: movi v1.4s, #128, lsl #24 -; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: usra v3.4s, v2.4s, #1 +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll index 0598af7c98063..a74f0c86fe185 100644 --- a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll @@ -245,6 +245,7 @@ define <4 x i32> @fold_srem_v4i32(<4 x i32> %x) { ; CHECK-LABEL: fold_srem_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #26215 // =0x6667 +; CHECK-NEXT: movi v3.4s, #10 ; CHECK-NEXT: movk w8, #26214, lsl #16 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s @@ -252,8 +253,7 @@ define <4 x i32> @fold_srem_v4i32(<4 x i32> %x) { ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s ; CHECK-NEXT: sshr v2.4s, v1.4s, #2 ; CHECK-NEXT: usra v2.4s, v1.4s, #31 -; CHECK-NEXT: movi v1.4s, #10 -; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-NEXT: ret %1 = srem <4 x i32> %x, ret <4 x i32> %1 diff --git a/llvm/test/CodeGen/AArch64/sve-abd.ll b/llvm/test/CodeGen/AArch64/sve-abd.ll index 95aec0a492619..7b492229e3d23 100644 --- a/llvm/test/CodeGen/AArch64/sve-abd.ll +++ b/llvm/test/CodeGen/AArch64/sve-abd.ll @@ -24,10 +24,10 @@ define @sabd_b( %a, %b) define @sabd_b_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: sabd_b_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: sabd z0.b, p2/m, z0.b, z1.b +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: sabd z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret %a.sext = sext %a to %b.sext = sext %b to @@ -144,10 +144,10 @@ define @uabd_b( %a, %b) define @uabd_b_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_b_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 ; CHECK-NEXT: mov z1.b, p1/z, #1 // =0x1 -; CHECK-NEXT: uabd z0.b, p2/m, z0.b, z1.b +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: uabd z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret %a.zext = zext %a to %b.zext = zext %b to @@ -173,9 +173,9 @@ define @uabd_h( %a, %b) define @uabd_h_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_h_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: uabd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %a.zext = zext %a to @@ -202,9 +202,9 @@ define @uabd_s( %a, %b) define @uabd_s_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_s_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %a.zext = zext %a to @@ -231,9 +231,9 @@ define @uabd_d( %a, %b) define @uabd_d_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_d_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uabd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %a.zext = zext %a to @@ -248,8 +248,8 @@ define @uabd_d_promoted_ops( %a, @uabd_non_matching_extension( %a, %b) #0 { ; CHECK-LABEL: uabd_non_matching_extension: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z1.s, z1.s, #0xff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %a.zext = zext %a to @@ -265,9 +265,9 @@ define @uabd_non_matching_extension( %a, @uabd_non_matching_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_non_matching_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.s, z0.s, #0xff ; CHECK-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %a.zext = zext %a to diff --git a/llvm/test/CodeGen/AArch64/sve-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-bitcast.ll index 7dd568fc837a3..95f43ba512632 100644 --- a/llvm/test/CodeGen/AArch64/sve-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/sve-bitcast.ll @@ -1698,8 +1698,8 @@ define @bitcast_nxv8i8_to_nxv1i64( %v) #0 { ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.b ; CHECK_BE-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK_BE-NEXT: ptrue p0.b ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -1720,8 +1720,8 @@ define @bitcast_nxv4i16_to_nxv1i64( %v) #0 ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -1742,8 +1742,8 @@ define @bitcast_nxv2i32_to_nxv1i64( %v) #0 ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -2218,8 +2218,8 @@ define @bitcast_nxv8i8_to_nxv1f64( %v) #0 ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.b ; CHECK_BE-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK_BE-NEXT: ptrue p0.b ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -2240,8 +2240,8 @@ define @bitcast_nxv4i16_to_nxv1f64( %v) ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -2262,8 +2262,8 @@ define @bitcast_nxv2i32_to_nxv1f64( %v) ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -2827,11 +2827,11 @@ define @bitcast_nxv2f16_to_nxv1i32( %v) #0 ; CHECK_BE-NEXT: addvl sp, sp, #-2 ; CHECK_BE-NEXT: ptrue p0.d ; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: ptrue p2.s ; CHECK_BE-NEXT: st1h { z0.d }, p0, [sp] +; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] ; CHECK_BE-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1w { z0.s }, p2/z, [sp, #1, mul vl] +; CHECK_BE-NEXT: ld1w { z0.s }, p0/z, [sp, #1, mul vl] ; CHECK_BE-NEXT: addvl sp, sp, #2 ; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret @@ -2860,11 +2860,11 @@ define @bitcast_nxv2bf16_to_nxv1i32( %v) ; CHECK_BE-NEXT: addvl sp, sp, #-2 ; CHECK_BE-NEXT: ptrue p0.d ; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: ptrue p2.s ; CHECK_BE-NEXT: st1h { z0.d }, p0, [sp] +; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] ; CHECK_BE-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1w { z0.s }, p2/z, [sp, #1, mul vl] +; CHECK_BE-NEXT: ld1w { z0.s }, p0/z, [sp, #1, mul vl] ; CHECK_BE-NEXT: addvl sp, sp, #2 ; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll index 56b023086ea24..3b7b03e6ef61f 100644 --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -64,12 +64,12 @@ define float @foo2(ptr %x0, ptr %x1) nounwind { ; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: mov w2, #2 // =0x2 ; CHECK-NEXT: mov w3, #3 // =0x3 +; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0] +; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: mov w4, #4 // =0x4 ; CHECK-NEXT: mov w5, #5 // =0x5 ; CHECK-NEXT: mov w6, #6 // =0x6 ; CHECK-NEXT: mov w7, #7 // =0x7 -; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0] -; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ld4d { z16.d - z19.d }, p0/z, [x1] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w1, #1 // =0x1 @@ -182,8 +182,8 @@ entry: define double @foo5(i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, ptr %ptr1, ptr %ptr2, double %x0, %x1, %x2) nounwind { ; CHECK-LABEL: foo5: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr x8, [sp] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x8, #1, mul vl] ; CHECK-NEXT: ld1d { z6.d }, p0/z, [x8] ; CHECK-NEXT: ld1d { z7.d }, p0/z, [x8, #3, mul vl] @@ -229,10 +229,10 @@ entry: define void @aavpcs1(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, i32 %s6, %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1w { z24.s }, p0/z, [x7] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8] ; CHECK-NEXT: st1w { z0.s }, p0, [x9] ; CHECK-NEXT: st1w { z1.s }, p0, [x9] ; CHECK-NEXT: st1w { z2.s }, p0, [x9] @@ -261,12 +261,12 @@ entry: define void @aavpcs2(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16,ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x7] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x6] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x5] ; CHECK-NEXT: ld1w { z5.s }, p0/z, [x1] ; CHECK-NEXT: ld1w { z6.s }, p0/z, [x4] @@ -299,11 +299,10 @@ entry: define void @aavpcs3(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, %s17, %p0, ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: ldr x9, [sp, #16] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x7] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x6] @@ -311,15 +310,16 @@ define void @aavpcs3(float %s0, float %s1, float %s2, float %s3, float %s4, floa ; CHECK-NEXT: ld1w { z6.s }, p0/z, [x2] ; CHECK-NEXT: ld1w { z7.s }, p0/z, [x4] ; CHECK-NEXT: ld1w { z24.s }, p0/z, [x3] -; CHECK-NEXT: st1w { z0.s }, p0, [x9] -; CHECK-NEXT: st1w { z3.s }, p0, [x9] -; CHECK-NEXT: st1w { z6.s }, p0, [x9] -; CHECK-NEXT: st1w { z24.s }, p0, [x9] -; CHECK-NEXT: st1w { z7.s }, p0, [x9] -; CHECK-NEXT: st1w { z5.s }, p0, [x9] -; CHECK-NEXT: st1w { z4.s }, p0, [x9] -; CHECK-NEXT: st1w { z2.s }, p0, [x9] -; CHECK-NEXT: st1w { z1.s }, p0, [x9] +; CHECK-NEXT: ldr x8, [sp, #16] +; CHECK-NEXT: st1w { z1.s }, p0, [x8] +; CHECK-NEXT: st1w { z3.s }, p0, [x8] +; CHECK-NEXT: st1w { z6.s }, p0, [x8] +; CHECK-NEXT: st1w { z24.s }, p0, [x8] +; CHECK-NEXT: st1w { z7.s }, p0, [x8] +; CHECK-NEXT: st1w { z5.s }, p0, [x8] +; CHECK-NEXT: st1w { z4.s }, p0, [x8] +; CHECK-NEXT: st1w { z2.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x8] ; CHECK-NEXT: ret entry: store volatile %s8, ptr %ptr @@ -339,8 +339,8 @@ entry: define void @aavpcs4(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, i32 %s6, i32 %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, %s17, ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x8, [sp] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x9, [sp, #16] ; CHECK-NEXT: ld1w { z24.s }, p0/z, [x8] ; CHECK-NEXT: st1w { z0.s }, p0, [x9] @@ -371,11 +371,10 @@ entry: define @aavpcs5(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, %s17, ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs5: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: ldr x9, [sp, #16] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x7] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x6] @@ -383,15 +382,16 @@ define @aavpcs5(float %s0, float %s1, float %s2, float %s3, ; CHECK-NEXT: ld1w { z6.s }, p0/z, [x2] ; CHECK-NEXT: ld1w { z7.s }, p0/z, [x4] ; CHECK-NEXT: ld1w { z24.s }, p0/z, [x3] -; CHECK-NEXT: st1w { z0.s }, p0, [x9] -; CHECK-NEXT: st1w { z3.s }, p0, [x9] -; CHECK-NEXT: st1w { z6.s }, p0, [x9] -; CHECK-NEXT: st1w { z24.s }, p0, [x9] -; CHECK-NEXT: st1w { z7.s }, p0, [x9] -; CHECK-NEXT: st1w { z5.s }, p0, [x9] -; CHECK-NEXT: st1w { z4.s }, p0, [x9] -; CHECK-NEXT: st1w { z2.s }, p0, [x9] -; CHECK-NEXT: st1w { z1.s }, p0, [x9] +; CHECK-NEXT: ldr x8, [sp, #16] +; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z3.s }, p0, [x8] +; CHECK-NEXT: st1w { z6.s }, p0, [x8] +; CHECK-NEXT: st1w { z24.s }, p0, [x8] +; CHECK-NEXT: st1w { z7.s }, p0, [x8] +; CHECK-NEXT: st1w { z5.s }, p0, [x8] +; CHECK-NEXT: st1w { z4.s }, p0, [x8] +; CHECK-NEXT: st1w { z2.s }, p0, [x8] +; CHECK-NEXT: st1w { z1.s }, p0, [x8] ; CHECK-NEXT: ret entry: store volatile %s8, ptr %ptr @@ -409,11 +409,10 @@ entry: define void @aapcs1(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, %s17, ptr %ptr) nounwind { ; CHECK-LABEL: aapcs1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: ldr x9, [sp, #16] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x7] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x6] @@ -421,15 +420,16 @@ define void @aapcs1(float %s0, float %s1, float %s2, float %s3, float %s4, float ; CHECK-NEXT: ld1w { z6.s }, p0/z, [x2] ; CHECK-NEXT: ld1w { z7.s }, p0/z, [x4] ; CHECK-NEXT: ld1w { z16.s }, p0/z, [x3] -; CHECK-NEXT: st1w { z0.s }, p0, [x9] -; CHECK-NEXT: st1w { z3.s }, p0, [x9] -; CHECK-NEXT: st1w { z6.s }, p0, [x9] -; CHECK-NEXT: st1w { z16.s }, p0, [x9] -; CHECK-NEXT: st1w { z7.s }, p0, [x9] -; CHECK-NEXT: st1w { z5.s }, p0, [x9] -; CHECK-NEXT: st1w { z4.s }, p0, [x9] -; CHECK-NEXT: st1w { z2.s }, p0, [x9] -; CHECK-NEXT: st1w { z1.s }, p0, [x9] +; CHECK-NEXT: ldr x8, [sp, #16] +; CHECK-NEXT: st1w { z1.s }, p0, [x8] +; CHECK-NEXT: st1w { z3.s }, p0, [x8] +; CHECK-NEXT: st1w { z6.s }, p0, [x8] +; CHECK-NEXT: st1w { z16.s }, p0, [x8] +; CHECK-NEXT: st1w { z7.s }, p0, [x8] +; CHECK-NEXT: st1w { z5.s }, p0, [x8] +; CHECK-NEXT: st1w { z4.s }, p0, [x8] +; CHECK-NEXT: st1w { z2.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x8] ; CHECK-NEXT: ret entry: store volatile %s8, ptr %ptr @@ -486,13 +486,13 @@ define void @non_sve_caller_high_range_non_sve_callee_high_range(float %f0, floa ; CHECK-NEXT: fmov s2, #2.00000000 ; CHECK-NEXT: fmov s3, #3.00000000 ; CHECK-NEXT: fmov s4, #4.00000000 -; CHECK-NEXT: fmov s5, #5.00000000 -; CHECK-NEXT: fmov s6, #6.00000000 -; CHECK-NEXT: fmov s7, #7.00000000 ; CHECK-NEXT: ld1w { z16.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z17.s }, p0/z, [x1] ; CHECK-NEXT: addvl x0, sp, #1 +; CHECK-NEXT: fmov s5, #5.00000000 +; CHECK-NEXT: fmov s6, #6.00000000 ; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: fmov s7, #7.00000000 ; CHECK-NEXT: st1w { z17.s }, p0, [sp] ; CHECK-NEXT: st1w { z16.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: bl non_sve_callee_high_range @@ -548,20 +548,20 @@ define @sve_caller_non_sve_callee_high_range( %a, %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x8, #-1 // =0xffffffffffffffff -; CHECK-NEXT: whilels p1.s, xzr, x8 ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: lastb w8, p1, z0.s +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb w8, p0, z0.s ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret %vcond = fcmp oeq %a, %b diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll index f5721cd0fd793..7bc31d44bb654 100644 --- a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll +++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll @@ -87,8 +87,8 @@ define float @fmaximum_f32( %a, %b) { define i32 @add_i32( %a, %b) { ; CHECK-LABEL: add_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add z0.s, z0.s, z2.s ; CHECK-NEXT: uaddv d0, p0, z0.s ; CHECK-NEXT: fmov x0, d0 @@ -160,8 +160,8 @@ define i16 @add_ext_v32i16( %a, %b) { define i32 @and_i32( %a, %b) { ; CHECK-LABEL: and_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.d, z0.d, z2.d ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -175,8 +175,8 @@ define i32 @and_i32( %a, %b) { define i32 @or_i32( %a, %b) { ; CHECK-LABEL: or_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: orr z0.d, z0.d, z2.d ; CHECK-NEXT: orv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -190,8 +190,8 @@ define i32 @or_i32( %a, %b) { define i32 @xor_i32( %a, %b) { ; CHECK-LABEL: xor_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: eor3 z0.d, z0.d, z1.d, z2.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-expand-div.ll b/llvm/test/CodeGen/AArch64/sve-expand-div.ll index fe5cdc9387728..180c64e0a7de1 100644 --- a/llvm/test/CodeGen/AArch64/sve-expand-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-expand-div.ll @@ -10,8 +10,8 @@ define @sdiv_i8( %a) #0 { ; CHECK-LABEL: sdiv_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z1.b, #86 // =0x56 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: lsr z1.b, z0.b, #7 ; CHECK-NEXT: add z0.b, z0.b, z1.b @@ -23,8 +23,8 @@ define @sdiv_i8( %a) #0 { define @sdiv_i16( %a) #0 { ; CHECK-LABEL: sdiv_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #21846 // =0x5556 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: lsr z1.h, z0.h, #15 @@ -37,8 +37,8 @@ define @sdiv_i16( %a) #0 { define @sdiv_i32( %a) #0 { ; CHECK-LABEL: sdiv_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #21846 // =0x5556 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movk w8, #21845, lsl #16 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s @@ -52,8 +52,8 @@ define @sdiv_i32( %a) #0 { define @sdiv_i64( %a) #0 { ; CHECK-LABEL: sdiv_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movk x8, #21846 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d @@ -71,8 +71,8 @@ define @sdiv_i64( %a) #0 { define @udiv_i8( %a) #0 { ; CHECK-LABEL: udiv_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z1.b, #-85 // =0xffffffffffffffab +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: lsr z0.b, z0.b, #1 ; CHECK-NEXT: ret @@ -83,8 +83,8 @@ define @udiv_i8( %a) #0 { define @udiv_i16( %a) #0 { ; CHECK-LABEL: udiv_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #-21845 // =0xffffaaab +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: lsr z0.h, z0.h, #1 @@ -96,8 +96,8 @@ define @udiv_i16( %a) #0 { define @udiv_i32( %a) #0 { ; CHECK-LABEL: udiv_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #43691 // =0xaaab +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s @@ -110,8 +110,8 @@ define @udiv_i32( %a) #0 { define @udiv_i64( %a) #0 { ; CHECK-LABEL: udiv_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movk x8, #43691 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll index a3c34b53baa07..6d4f5963881e5 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll @@ -616,8 +616,8 @@ define i1 @test_last_8xi1( %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 -; CHECK-NEXT: whilels p1.h, xzr, x8 -; CHECK-NEXT: lastb w8, p1, z0.h +; CHECK-NEXT: whilels p0.h, xzr, x8 +; CHECK-NEXT: lastb w8, p0, z0.h ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret %vscale = call i64 @llvm.vscale.i64() @@ -630,10 +630,10 @@ define i1 @test_last_8xi1( %a) #0 { define i1 @test_lanex_4xi1( %a, i32 %x) #0 { ; CHECK-LABEL: test_lanex_4xi1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: whilels p1.s, xzr, x8 -; CHECK-NEXT: lastb w8, p1, z0.s +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb w8, p0, z0.s ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret %b = extractelement %a, i32 %x diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll index bc1c563810f35..b9c531fe33526 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll @@ -100,16 +100,16 @@ define <2 x i64> @extract_v2i64_nxv8i64_8( %arg) { ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cnth x8 ; CHECK-NEXT: mov w9, #8 // =0x8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: cmp x8, #8 +; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] ; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] @@ -183,10 +183,10 @@ define <4 x i1> @extract_v4i1_nxv32i1_16( %arg) { ; CHECK-NEXT: addvl sp, sp, #-8 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: st1b { z0.b }, p2, [sp, #1, mul vl] ; CHECK-NEXT: st1b { z1.b }, p2, [sp] diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll index e2f8dad03ef6f..88268104889fd 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll @@ -17,15 +17,15 @@ define <2 x i64> @extract_v2i64_nxv2i64_idx2( %vec) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x8 ; CHECK-NEXT: mov w9, #2 // =0x2 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: cmp x8, #2 +; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -50,15 +50,15 @@ define <4 x i32> @extract_v4i32_nxv4i32_idx4( %vec) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cntw x8 ; CHECK-NEXT: mov w9, #4 // =0x4 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sub x8, x8, #4 ; CHECK-NEXT: cmp x8, #4 +; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #2 -; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -115,15 +115,15 @@ define <8 x i16> @extract_v8i16_nxv8i16_idx8( %vec) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cnth x8 ; CHECK-NEXT: mov w9, #8 // =0x8 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: sub x8, x8, #8 ; CHECK-NEXT: cmp x8, #8 +; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #1 -; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -214,14 +214,14 @@ define <16 x i8> @extract_v16i8_nxv16i8_idx16( %vec) nounwind ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w9, #16 // =0x10 ; CHECK-NEXT: sub x8, x8, #16 ; CHECK-NEXT: cmp x8, #16 +; CHECK-NEXT: st1b { z0.b }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1b { z0.b }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll index e60a2f142922f..3c0bd501f45d8 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll @@ -65,27 +65,25 @@ define @extract_nxv14i1_nxv28i1_14( %in) uw ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: punpkhi p2.h, p1.b -; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: punpklo p1.h, p1.b ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: punpklo p2.h, p2.b ; CHECK-NEXT: punpkhi p3.h, p1.b -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: punpkhi p4.h, p2.b ; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: uzp1 p4.s, p4.s, p0.s +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: punpkhi p5.h, p3.b -; CHECK-NEXT: punpklo p3.h, p3.b -; CHECK-NEXT: punpkhi p6.h, p1.b ; CHECK-NEXT: punpklo p1.h, p1.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: uzp1 p2.s, p5.s, p2.s +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: punpkhi p5.h, p1.b +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: uzp1 p3.s, p5.s, p3.s ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p3.s, p6.s, p3.s -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p4.s, p4.s, p0.s ; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s ; CHECK-NEXT: uzp1 p1.h, p2.h, p4.h ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-fcmp.ll b/llvm/test/CodeGen/AArch64/sve-fcmp.ll index f7e3b6d0171ac..35cbe65c6a8b8 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcmp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcmp.ll @@ -374,8 +374,8 @@ define @one_zero( %x) { define @ueq_zero( %x) { ; CHECK-LABEL: ueq_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fcmuo p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b diff --git a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll index f15807597ac21..78843e392e536 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll @@ -63,10 +63,10 @@ define @test_copysign_v4f32_v4f64( %a, ; CHECK-EXTEND-ROUND-NEXT: ptrue p0.d ; CHECK-EXTEND-ROUND-NEXT: uunpkhi z3.d, z0.s ; CHECK-EXTEND-ROUND-NEXT: uunpklo z0.d, z0.s -; CHECK-EXTEND-ROUND-NEXT: and z3.s, z3.s, #0x7fffffff -; CHECK-EXTEND-ROUND-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-EXTEND-ROUND-NEXT: fcvt z2.s, p0/m, z2.d ; CHECK-EXTEND-ROUND-NEXT: fcvt z1.s, p0/m, z1.d +; CHECK-EXTEND-ROUND-NEXT: and z3.s, z3.s, #0x7fffffff +; CHECK-EXTEND-ROUND-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-EXTEND-ROUND-NEXT: and z2.s, z2.s, #0x80000000 ; CHECK-EXTEND-ROUND-NEXT: and z1.s, z1.s, #0x80000000 ; CHECK-EXTEND-ROUND-NEXT: orr z2.d, z3.d, z2.d @@ -115,9 +115,9 @@ declare @llvm.copysign.v2f64( %a, @test_copysign_v4f64_v4f32( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v4f64_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z3.d, z2.s ; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; CHECK-NEXT: and z1.d, z1.d, #0x7fffffffffffffff ; CHECK-NEXT: fcvt z3.d, p0/m, z3.s @@ -193,10 +193,10 @@ define @test_copysign_v4f16_v4f64( %a, @test_copysign_v8f16_v8f32( %a, @test_copysign_v8f16_v8f32( %a, @test_copysign_nxv4f32_nxv4f16( %a, %b) #0 { ; CHECK-NO-EXTEND-ROUND-LABEL: test_copysign_nxv4f32_nxv4f16: ; CHECK-NO-EXTEND-ROUND: // %bb.0: -; CHECK-NO-EXTEND-ROUND-NEXT: ptrue p0.s ; CHECK-NO-EXTEND-ROUND-NEXT: and z1.s, z1.s, #0x80000000 ; CHECK-NO-EXTEND-ROUND-NEXT: and z0.s, z0.s, #0x7fffffff +; CHECK-NO-EXTEND-ROUND-NEXT: ptrue p0.s ; CHECK-NO-EXTEND-ROUND-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NO-EXTEND-ROUND-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NO-EXTEND-ROUND-NEXT: ret @@ -285,9 +285,9 @@ define @test_copysign_nxv4f32_nxv4f16( % define @test_copysign_nxv2f64_nxv2f32( %a, %b) #0 { ; CHECK-NO-EXTEND-ROUND-LABEL: test_copysign_nxv2f64_nxv2f32: ; CHECK-NO-EXTEND-ROUND: // %bb.0: -; CHECK-NO-EXTEND-ROUND-NEXT: ptrue p0.d ; CHECK-NO-EXTEND-ROUND-NEXT: and z1.d, z1.d, #0x8000000000000000 ; CHECK-NO-EXTEND-ROUND-NEXT: and z0.d, z0.d, #0x7fffffffffffffff +; CHECK-NO-EXTEND-ROUND-NEXT: ptrue p0.d ; CHECK-NO-EXTEND-ROUND-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NO-EXTEND-ROUND-NEXT: fcvt z0.s, p0/m, z0.d ; CHECK-NO-EXTEND-ROUND-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-fcvt.ll index 0fe38bf9ae718..fc5128fffad36 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcvt.ll @@ -454,9 +454,9 @@ define @fcvtzu_d_nxv2f64( %a) { define @scvtf_h_nxv2i1( %a) { ; CHECK-LABEL: scvtf_h_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z0.h, p0/m, z0.d ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -495,9 +495,9 @@ define @scvtf_h_nxv2i64( %a) { define @scvtf_h_nxv3i1( %a) { ; CHECK-LABEL: scvtf_h_nxv3i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -516,9 +516,9 @@ define @scvtf_h_nxv3i16( %a) { define @scvtf_h_nxv4i1( %a) { ; CHECK-LABEL: scvtf_h_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -547,9 +547,9 @@ define @scvtf_h_nxv4i32( %a) { define @scvtf_h_nxv7i1( %a) { ; CHECK-LABEL: scvtf_h_nxv7i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: scvtf z0.h, p0/m, z0.h ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -568,9 +568,9 @@ define @scvtf_h_nxv7i16( %a) { define @scvtf_h_nxv8i1( %a) { ; CHECK-LABEL: scvtf_h_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: scvtf z0.h, p0/m, z0.h ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -589,9 +589,9 @@ define @scvtf_h_nxv8i16( %a) { define @scvtf_s_nxv2i1( %a) { ; CHECK-LABEL: scvtf_s_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.s, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -620,9 +620,9 @@ define @scvtf_s_nxv2i64( %a) { define @scvtf_s_nxv3i1( %a) { ; CHECK-LABEL: scvtf_s_nxv3i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.s, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -641,9 +641,9 @@ define @scvtf_s_nxv3i32( %a) { define @scvtf_s_nxv4i1( %a) { ; CHECK-LABEL: scvtf_s_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.s, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -662,9 +662,9 @@ define @scvtf_s_nxv4i32( %a) { define @scvtf_d_nxv2i1( %a) { ; CHECK-LABEL: scvtf_d_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.d, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -695,9 +695,9 @@ define @scvtf_d_nxv2i64( %a) { define @ucvtf_h_nxv2i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.h, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -736,9 +736,9 @@ define @ucvtf_h_nxv2i64( %a) { define @ucvtf_h_nxv3i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv3i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -767,9 +767,9 @@ define @ucvtf_h_nxv3i32( %a) { define @ucvtf_h_nxv4i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -798,9 +798,9 @@ define @ucvtf_h_nxv4i32( %a) { define @ucvtf_h_nxv8i1( %a) { ; CHECK-LABEL: ucvtf_h_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.h, p1/m, z0.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -819,9 +819,9 @@ define @ucvtf_h_nxv8i16( %a) { define @ucvtf_s_nxv2i1( %a) { ; CHECK-LABEL: ucvtf_s_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.s, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -850,9 +850,9 @@ define @ucvtf_s_nxv2i64( %a) { define @ucvtf_s_nxv4i1( %a) { ; CHECK-LABEL: ucvtf_s_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.s, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ret %res = uitofp %a to ret %res @@ -871,9 +871,9 @@ define @ucvtf_s_nxv4i32( %a) { define @ucvtf_d_nxv2i1( %a) { ; CHECK-LABEL: ucvtf_d_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.d, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: ret %res = uitofp %a to ret %res diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll index ed7ea657874a4..28e1412c524a0 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll @@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_gather_base_plus_stride_v8f32(ptr %dst, ptr %src) #0 { ; CHECK-LABEL: masked_gather_base_plus_stride_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: index z0.s, #0, #7 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1, z0.s, sxtw #2] ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -21,8 +21,8 @@ define void @masked_gather_base_plus_stride_v8f32(ptr %dst, ptr %src) #0 { define void @masked_gather_base_plus_stride_v4f64(ptr %dst, ptr %src) #0 { ; CHECK-LABEL: masked_gather_base_plus_stride_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mov x8, #-32 // =0xffffffffffffffe0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: index z0.d, #-2, x8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1, z0.d, lsl #3] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll index ad482118ec0bb..47fda39d84001 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll @@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu" define void @build_vector_7_inc1_v32i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: build_vector_7_inc1_v32i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: index z0.b, #7, #1 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret store <32 x i8> , ptr %a, align 1 @@ -18,8 +18,8 @@ define void @build_vector_7_inc1_v32i8(ptr %a) #0 { define void @build_vector_0_inc2_v16i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: build_vector_0_inc2_v16i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: index z0.h, #0, #2 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret store <16 x i16> , ptr %a, align 2 @@ -30,8 +30,8 @@ define void @build_vector_0_inc2_v16i16(ptr %a) #0 { define void @build_vector_0_dec3_v8i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: build_vector_0_dec3_v8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: index z0.s, #0, #-3 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret store <8 x i32> , ptr %a, align 4 @@ -42,8 +42,8 @@ define void @build_vector_0_dec3_v8i32(ptr %a) #0 { define void @build_vector_minus2_dec32_v4i64(ptr %a) #0 { ; VBITS_GE_256-LABEL: build_vector_minus2_dec32_v4i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #-32 // =0xffffffffffffffe0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: index z0.d, #-2, x8 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -53,11 +53,6 @@ define void @build_vector_minus2_dec32_v4i64(ptr %a) #0 { ; Constant but not a sequence. define void @build_vector_no_stride_v4i64(ptr %a) #0 { -; VBITS_GE_256-LABEL: .LCPI4_0: -; VBITS_GE_256: .xword 0 -; VBITS_GE_256-NEXT: .xword 4 -; VBITS_GE_256-NEXT: .xword 1 -; VBITS_GE_256-NEXT: .xword 8 ; VBITS_GE_256-LABEL: build_vector_no_stride_v4i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll index 65cb448cac117..f7751131005e3 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll @@ -38,9 +38,9 @@ define void @concat_v32i8(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.b, vl32 ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p1, [x2] +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: st1b { z0.b }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x i8>, ptr %a %op2 = load <16 x i8>, ptr %b @@ -66,11 +66,11 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v64i8: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.b, vl32 -; VBITS_GE_512-NEXT: ptrue p1.b, vl64 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.b, p0, z0.b, z1.b -; VBITS_GE_512-NEXT: st1b { z0.b }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -90,11 +90,11 @@ define void @concat_v128i8(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v128i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p1, [x2] +; CHECK-NEXT: ptrue p0.b, vl128 +; CHECK-NEXT: st1b { z0.b }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <64 x i8>, ptr %a %op2 = load <64 x i8>, ptr %b @@ -122,11 +122,11 @@ define void @concat_v256i8(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v256i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: ptrue p1.b, vl256 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p1, [x2] +; CHECK-NEXT: ptrue p0.b, vl256 +; CHECK-NEXT: st1b { z0.b }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -198,9 +198,9 @@ define void @concat_v16i16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.h, vl16 ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <8 x i16>, ptr %a %op2 = load <8 x i16>, ptr %b @@ -224,11 +224,11 @@ define void @concat_v32i16(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v32i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: ptrue p1.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.h, p0, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -244,11 +244,11 @@ define void @concat_v64i16(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v64i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <32 x i16>, ptr %a %op2 = load <32 x i16>, ptr %b @@ -268,11 +268,11 @@ define void @concat_v128i16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v128i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <64 x i16>, ptr %a %op2 = load <64 x i16>, ptr %b @@ -328,9 +328,9 @@ define void @concat_v8i32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.s, vl8 ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <4 x i32>, ptr %a %op2 = load <4 x i32>, ptr %b @@ -353,11 +353,11 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: ptrue p1.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.s, p0, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -371,11 +371,11 @@ define void @concat_v32i32(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v32i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x i32>, ptr %a %op2 = load <16 x i32>, ptr %b @@ -391,11 +391,11 @@ define void @concat_v64i32(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v64i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <32 x i32>, ptr %a %op2 = load <32 x i32>, ptr %b @@ -433,9 +433,9 @@ define void @concat_v4i64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.d, vl4 ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b @@ -458,11 +458,11 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl4 -; VBITS_GE_512-NEXT: ptrue p1.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.d, p0, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -475,11 +475,11 @@ define void @concat_v16i64(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v16i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ptrue p1.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl16 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <8 x i64>, ptr %a %op2 = load <8 x i64>, ptr %b @@ -493,11 +493,11 @@ define void @concat_v32i64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v32i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl32 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x i64>, ptr %a %op2 = load <16 x i64>, ptr %b @@ -541,9 +541,9 @@ define void @concat_v16f16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.h, vl16 ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <8 x half>, ptr %a %op2 = load <8 x half>, ptr %b @@ -567,11 +567,11 @@ define void @concat_v32f16(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v32f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: ptrue p1.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.h, p0, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -587,11 +587,11 @@ define void @concat_v64f16(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v64f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <32 x half>, ptr %a %op2 = load <32 x half>, ptr %b @@ -611,11 +611,11 @@ define void @concat_v128f16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v128f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <64 x half>, ptr %a %op2 = load <64 x half>, ptr %b @@ -671,9 +671,9 @@ define void @concat_v8f32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.s, vl8 ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <4 x float>, ptr %a %op2 = load <4 x float>, ptr %b @@ -696,11 +696,11 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: ptrue p1.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.s, p0, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -714,11 +714,11 @@ define void @concat_v32f32(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v32f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x float>, ptr %a %op2 = load <16 x float>, ptr %b @@ -734,11 +734,11 @@ define void @concat_v64f32(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v64f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <32 x float>, ptr %a %op2 = load <32 x float>, ptr %b @@ -776,9 +776,9 @@ define void @concat_v4f64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.d, vl4 ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <2 x double>, ptr %a %op2 = load <2 x double>, ptr %b @@ -801,11 +801,11 @@ define void @concat_v8f64(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl4 -; VBITS_GE_512-NEXT: ptrue p1.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.d, p0, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -818,11 +818,11 @@ define void @concat_v16f64(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v16f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ptrue p1.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl16 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <8 x double>, ptr %a %op2 = load <8 x double>, ptr %b @@ -836,11 +836,11 @@ define void @concat_v32f64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v32f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl32 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x double>, ptr %a %op2 = load <16 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll index 485124c1d59ed..ad4efeaf39247 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll @@ -70,9 +70,9 @@ define half @extractelement_v64f16(ptr %a) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov w8, #63 // =0x3f -; CHECK-NEXT: whilels p1.h, xzr, x8 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: lastb h0, p1, z0.h +; CHECK-NEXT: whilels p0.h, xzr, x8 +; CHECK-NEXT: lastb h0, p0, z0.h ; CHECK-NEXT: ret %op1 = load <64 x half>, ptr %a %r = extractelement <64 x half> %op1, i64 63 @@ -84,9 +84,9 @@ define half @extractelement_v128f16(ptr %a) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov w8, #127 // =0x7f -; CHECK-NEXT: whilels p1.h, xzr, x8 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: lastb h0, p1, z0.h +; CHECK-NEXT: whilels p0.h, xzr, x8 +; CHECK-NEXT: lastb h0, p0, z0.h ; CHECK-NEXT: ret %op1 = load <128 x half>, ptr %a %r = extractelement <128 x half> %op1, i64 127 @@ -154,9 +154,9 @@ define float @extractelement_v32f32(ptr %a) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: mov w8, #31 // =0x1f -; CHECK-NEXT: whilels p1.s, xzr, x8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: lastb s0, p1, z0.s +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb s0, p0, z0.s ; CHECK-NEXT: ret %op1 = load <32 x float>, ptr %a %r = extractelement <32 x float> %op1, i64 31 @@ -168,9 +168,9 @@ define float @extractelement_v64f32(ptr %a) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: mov w8, #63 // =0x3f -; CHECK-NEXT: whilels p1.s, xzr, x8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: lastb s0, p1, z0.s +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb s0, p0, z0.s ; CHECK-NEXT: ret %op1 = load <64 x float>, ptr %a %r = extractelement <64 x float> %op1, i64 63 @@ -236,9 +236,9 @@ define double @extractelement_v16f64(ptr %a) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: mov w8, #15 // =0xf -; CHECK-NEXT: whilels p1.d, xzr, x8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: lastb d0, p1, z0.d +; CHECK-NEXT: whilels p0.d, xzr, x8 +; CHECK-NEXT: lastb d0, p0, z0.d ; CHECK-NEXT: ret %op1 = load <16 x double>, ptr %a %r = extractelement <16 x double> %op1, i64 15 @@ -250,9 +250,9 @@ define double @extractelement_v32f64(ptr %a) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: mov w8, #31 // =0x1f -; CHECK-NEXT: whilels p1.d, xzr, x8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: lastb d0, p1, z0.d +; CHECK-NEXT: whilels p0.d, xzr, x8 +; CHECK-NEXT: lastb d0, p0, z0.d ; CHECK-NEXT: ret %op1 = load <32 x double>, ptr %a %r = extractelement <32 x double> %op1, i64 31 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll index bca3dfe5717ef..e77cd9ef55eaf 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll @@ -396,9 +396,9 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mvni v1.4s, #128, lsl #24 ; CHECK-NEXT: ldr q2, [x0] -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] -; CHECK-NEXT: fcvt z0.s, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.s, p0/m, z0.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: bit v0.16b, v2.16b, v1.16b ; CHECK-NEXT: str q0, [x0] @@ -450,12 +450,12 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; ; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32: ; CHECK_EXTEND_ROUND: // %bb.0: -; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d, vl4 ; CHECK_EXTEND_ROUND-NEXT: ldr q0, [x1] +; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d, vl4 ; CHECK_EXTEND_ROUND-NEXT: uunpklo z0.d, z0.s -; CHECK_EXTEND_ROUND-NEXT: fcvt z0.d, p0/m, z0.s ; CHECK_EXTEND_ROUND-NEXT: ld1d { z1.d }, p0/z, [x0] ; CHECK_EXTEND_ROUND-NEXT: and z1.d, z1.d, #0x7fffffffffffffff +; CHECK_EXTEND_ROUND-NEXT: fcvt z0.d, p0/m, z0.s ; CHECK_EXTEND_ROUND-NEXT: and z0.d, z0.d, #0x8000000000000000 ; CHECK_EXTEND_ROUND-NEXT: orr z0.d, z1.d, z0.d ; CHECK_EXTEND_ROUND-NEXT: st1d { z0.d }, p0, [x0] @@ -494,9 +494,9 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mvni v1.4h, #128, lsl #8 ; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] -; CHECK-NEXT: fcvt z0.h, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.h, p0/m, z0.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: bit v0.8b, v2.8b, v1.8b @@ -521,9 +521,9 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mvni v1.8h, #128, lsl #8 ; CHECK-NEXT: ldr q2, [x0] -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1] -; CHECK-NEXT: fcvt z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: bit v0.16b, v2.16b, v1.16b ; CHECK-NEXT: str q0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll index 6da07b855a5c5..b60988be1e76c 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll @@ -462,11 +462,11 @@ define void @fcvt_v8f64_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.d ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: fcvt z0.h, p1/m, z0.d -; VBITS_GE_256-NEXT: fcvt z1.h, p1/m, z1.d +; VBITS_GE_256-NEXT: ptrue p0.d +; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.d +; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.d ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll index 13ebda1df7f9d..d1e9dc13f50e8 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll @@ -34,15 +34,15 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) vsca define void @select_v16f16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl16 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x half>, ptr %a %op2 = load volatile <16 x half>, ptr %b @@ -54,33 +54,33 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v32f16(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov z0.h, w2 +; VBITS_GE_256-NEXT: ptrue p0.h ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.h, p1, z1.h, z3.h -; VBITS_GE_256-NEXT: mov z0.h, p1/m, z2.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p1/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.h, p0, z0.h, z2.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z3.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p1, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v32f16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov z0.h, w2 -; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: ptrue p0.h +; VBITS_GE_512-NEXT: ptrue p1.h, vl32 ; VBITS_GE_512-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; VBITS_GE_512-NEXT: sel z0.h, p1, z1.h, z2.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.h, p0, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <32 x half>, ptr %a %op2 = load volatile <32 x half>, ptr %b @@ -92,15 +92,15 @@ define void @select_v32f16(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v64f16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v64f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x half>, ptr %a %op2 = load volatile <64 x half>, ptr %b @@ -112,15 +112,15 @@ define void @select_v64f16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v128f16(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v128f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <128 x half>, ptr %a %op2 = load volatile <128 x half>, ptr %b @@ -158,15 +158,15 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) v define void @select_v8f32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl8 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <8 x float>, ptr %a %op2 = load volatile <8 x float>, ptr %b @@ -178,33 +178,33 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v16f32(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: and w8, w2, #0x1 -; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: ptrue p0.s ; VBITS_GE_256-NEXT: mov z0.s, w8 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.s, p1, z1.s, z3.s -; VBITS_GE_256-NEXT: mov z0.s, p1/m, z2.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.s, p0, z0.s, z2.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v16f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: and w8, w2, #0x1 -; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: ptrue p0.s ; VBITS_GE_512-NEXT: mov z0.s, w8 -; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; VBITS_GE_512-NEXT: sel z0.s, p1, z1.s, z2.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p1.s, vl16 +; VBITS_GE_512-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.s, p0, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <16 x float>, ptr %a %op2 = load volatile <16 x float>, ptr %b @@ -216,15 +216,15 @@ define void @select_v16f32(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v32f32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x float>, ptr %a %op2 = load volatile <32 x float>, ptr %b @@ -236,15 +236,15 @@ define void @select_v32f32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v64f32(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v64f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl64 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x float>, ptr %a %op2 = load volatile <64 x float>, ptr %b @@ -282,16 +282,16 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask define void @select_v4f64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl4 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <4 x double>, ptr %a %op2 = load volatile <4 x double>, ptr %b @@ -303,35 +303,35 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v8f64(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_256-NEXT: and x8, x2, #0x1 -; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ptrue p0.d ; VBITS_GE_256-NEXT: mov z0.d, x8 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.d, p1, z1.d, z3.d -; VBITS_GE_256-NEXT: mov z0.d, p1/m, z2.d -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.d, p0, z0.d, z2.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z3.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_512-NEXT: and x8, x2, #0x1 -; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: ptrue p0.d ; VBITS_GE_512-NEXT: mov z0.d, x8 -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_GE_512-NEXT: sel z0.d, p1, z1.d, z2.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p1.d, vl8 +; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.d, p0, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <8 x double>, ptr %a %op2 = load volatile <8 x double>, ptr %b @@ -343,16 +343,16 @@ define void @select_v8f64(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v16f64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x double>, ptr %a %op2 = load volatile <16 x double>, ptr %b @@ -364,16 +364,16 @@ define void @select_v16f64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v32f64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x double>, ptr %a %op2 = load volatile <32 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll index da0cf927d74d2..af54b146c5b66 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll @@ -131,8 +131,8 @@ define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) vscale_range(2,0) #0 { define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: fcvtzu_v8f16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -357,7 +357,6 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: fcvtzu z0.s, p0/m, z0.s @@ -366,7 +365,8 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzu_v16f32_v16i16: @@ -532,8 +532,8 @@ define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) vscale_range(2,0) #0 { define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: fcvtzu_v4f32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x1] @@ -752,7 +752,6 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: fcvtzu z0.d, p0/m, z0.d @@ -761,7 +760,8 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzu_v8f64_v8i32: @@ -1024,8 +1024,8 @@ define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) vscale_range(2,0) #0 { define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: fcvtzs_v8f16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -1250,7 +1250,6 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: fcvtzs z0.s, p0/m, z0.s @@ -1259,7 +1258,8 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzs_v16f32_v16i16: @@ -1425,8 +1425,8 @@ define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) vscale_range(2,0) #0 { define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: fcvtzs_v4f32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x1] @@ -1645,7 +1645,6 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: fcvtzs z0.d, p0/m, z0.d @@ -1654,7 +1653,8 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzs_v8f64_v8i32: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll index 115f722986b5c..61e04682fa0bf 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll @@ -85,10 +85,10 @@ define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: bl __trunctfdf2 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #128 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ldr z1, [x8, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d @@ -111,14 +111,14 @@ define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload ; CHECK-NEXT: bl __trunctfdf2 ; CHECK-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: ptrue p1.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #128 -; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload ; CHECK-NEXT: mov x8, #4 // =0x4 -; CHECK-NEXT: splice z0.d, p1, z0.d, z1.d +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: st1d { z0.d }, p0, [x19, x8, lsl #3] ; CHECK-NEXT: add x8, sp, #128 ; CHECK-NEXT: ldr z0, [x8, #1, mul vl] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll index a4b5ccd69fdb7..1bd688d23050b 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll @@ -21,24 +21,24 @@ define dso_local void @func1(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v ; CHECK-NEXT: .cfi_offset w22, -32 ; CHECK-NEXT: .cfi_offset w29, -48 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: add x11, sp, #176 -; CHECK-NEXT: add x10, sp, #144 -; CHECK-NEXT: add x9, sp, #112 +; CHECK-NEXT: add x10, sp, #176 ; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: add x9, sp, #144 ; CHECK-NEXT: add x20, sp, #176 -; CHECK-NEXT: ldp x13, x12, [sp, #328] ; CHECK-NEXT: ldr x15, [sp, #104] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x10] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: add x8, sp, #112 +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x9] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x8] ; CHECK-NEXT: ldur q4, [sp, #88] -; CHECK-NEXT: ldp x16, x17, [sp, #208] +; CHECK-NEXT: ldp x9, x8, [sp, #328] ; CHECK-NEXT: ldr x19, [sp, #272] +; CHECK-NEXT: ldp x11, x10, [sp, #312] +; CHECK-NEXT: ldp x13, x12, [sp, #296] +; CHECK-NEXT: ldp x18, x14, [sp, #280] +; CHECK-NEXT: ldp x16, x17, [sp, #208] ; CHECK-NEXT: ldp x21, x22, [sp, #352] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x11] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x10] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] -; CHECK-NEXT: ldp x8, x14, [sp, #312] -; CHECK-NEXT: ldp x10, x9, [sp, #296] -; CHECK-NEXT: ldp x18, x11, [sp, #280] ; CHECK-NEXT: st1d { z3.d }, p0, [x20] ; CHECK-NEXT: add x20, sp, #144 ; CHECK-NEXT: st1d { z2.d }, p0, [x20] @@ -53,10 +53,10 @@ define dso_local void @func1(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v ; CHECK-NEXT: stp x16, x17, [sp, #208] ; CHECK-NEXT: stur q4, [sp, #88] ; CHECK-NEXT: str x15, [sp, #104] -; CHECK-NEXT: stp x11, x10, [sp, #288] -; CHECK-NEXT: stp x9, x8, [sp, #304] -; CHECK-NEXT: stp x14, x13, [sp, #320] -; CHECK-NEXT: str x12, [sp, #336] +; CHECK-NEXT: stp x14, x13, [sp, #288] +; CHECK-NEXT: stp x12, x11, [sp, #304] +; CHECK-NEXT: stp x10, x9, [sp, #320] +; CHECK-NEXT: str x8, [sp, #336] ; CHECK-NEXT: ldr x29, [sp], #48 // 8-byte Folded Reload ; CHECK-NEXT: b func2 ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr %v13, ptr %v14, ptr %v15, ptr %v16, diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll index 977c528e2583a..6f4d257039bca 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll @@ -36,16 +36,16 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 { define void @insertelement_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: index z0.h, #0, #1 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; CHECK-NEXT: fmov h0, #5.00000000 -; CHECK-NEXT: mov z2.h, p1/m, h0 -; CHECK-NEXT: st1h { z2.h }, p0, [x1] +; CHECK-NEXT: ptrue p1.h, vl16 +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: fmov h1, #5.00000000 +; CHECK-NEXT: mov z0.h, p0/m, h1 +; CHECK-NEXT: st1h { z0.h }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %r = insertelement <16 x half> %op1, half 5.0, i64 15 @@ -56,33 +56,33 @@ define void @insertelement_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @insertelement_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: insertelement_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov w8, #15 // =0xf ; VBITS_GE_256-NEXT: index z0.h, #0, #1 -; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: ptrue p0.h ; VBITS_GE_256-NEXT: mov z1.h, w8 +; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: fmov h2, #5.00000000 -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: mov z3.h, p1/m, h2 -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] -; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; VBITS_GE_256-NEXT: fmov h0, #5.00000000 +; VBITS_GE_256-NEXT: ld1h { z1.h }, p1/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: mov z1.h, p0/m, h0 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p1, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: insertelement_v32f16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov w8, #31 // =0x1f ; VBITS_GE_512-NEXT: index z0.h, #0, #1 -; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: ptrue p0.h ; VBITS_GE_512-NEXT: mov z1.h, w8 -; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; VBITS_GE_512-NEXT: fmov h0, #5.00000000 -; VBITS_GE_512-NEXT: mov z2.h, p1/m, h0 -; VBITS_GE_512-NEXT: st1h { z2.h }, p0, [x1] +; VBITS_GE_512-NEXT: ptrue p1.h, vl32 +; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_512-NEXT: fmov h1, #5.00000000 +; VBITS_GE_512-NEXT: mov z0.h, p0/m, h1 +; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <32 x half>, ptr %a %r = insertelement <32 x half> %op1, half 5.0, i64 31 @@ -93,16 +93,16 @@ define void @insertelement_v32f16(ptr %a, ptr %b) #0 { define void @insertelement_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v64f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: index z0.h, #0, #1 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; CHECK-NEXT: fmov h0, #5.00000000 -; CHECK-NEXT: mov z2.h, p1/m, h0 -; CHECK-NEXT: st1h { z2.h }, p0, [x1] +; CHECK-NEXT: ptrue p1.h, vl64 +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: fmov h1, #5.00000000 +; CHECK-NEXT: mov z0.h, p0/m, h1 +; CHECK-NEXT: st1h { z0.h }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <64 x half>, ptr %a %r = insertelement <64 x half> %op1, half 5.0, i64 63 @@ -113,16 +113,16 @@ define void @insertelement_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @insertelement_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v128f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov w8, #127 // =0x7f ; CHECK-NEXT: index z0.h, #0, #1 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; CHECK-NEXT: fmov h0, #5.00000000 -; CHECK-NEXT: mov z2.h, p1/m, h0 -; CHECK-NEXT: st1h { z2.h }, p0, [x1] +; CHECK-NEXT: ptrue p1.h, vl128 +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: fmov h1, #5.00000000 +; CHECK-NEXT: mov z0.h, p0/m, h1 +; CHECK-NEXT: st1h { z0.h }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <128 x half>, ptr %a %r = insertelement <128 x half> %op1, half 5.0, i64 127 @@ -157,16 +157,16 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 { define void @insertelement_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z0.s, #0, #1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; CHECK-NEXT: fmov s0, #5.00000000 -; CHECK-NEXT: mov z2.s, p1/m, s0 -; CHECK-NEXT: st1w { z2.s }, p0, [x1] +; CHECK-NEXT: ptrue p1.s, vl8 +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: fmov s1, #5.00000000 +; CHECK-NEXT: mov z0.s, p0/m, s1 +; CHECK-NEXT: st1w { z0.s }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %r = insertelement <8 x float> %op1, float 5.0, i64 7 @@ -177,33 +177,33 @@ define void @insertelement_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @insertelement_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: insertelement_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov w8, #7 // =0x7 ; VBITS_GE_256-NEXT: index z0.s, #0, #1 -; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: ptrue p0.s ; VBITS_GE_256-NEXT: mov z1.s, w8 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: fmov s2, #5.00000000 -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: mov z3.s, p1/m, s2 -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] -; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_256-NEXT: fmov s0, #5.00000000 +; VBITS_GE_256-NEXT: ld1w { z1.s }, p1/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov z1.s, p0/m, s0 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: insertelement_v16f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: mov w8, #15 // =0xf ; VBITS_GE_512-NEXT: index z0.s, #0, #1 -; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: ptrue p0.s ; VBITS_GE_512-NEXT: mov z1.s, w8 -; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; VBITS_GE_512-NEXT: fmov s0, #5.00000000 -; VBITS_GE_512-NEXT: mov z2.s, p1/m, s0 -; VBITS_GE_512-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_GE_512-NEXT: ptrue p1.s, vl16 +; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_512-NEXT: fmov s1, #5.00000000 +; VBITS_GE_512-NEXT: mov z0.s, p0/m, s1 +; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x float>, ptr %a %r = insertelement <16 x float> %op1, float 5.0, i64 15 @@ -214,16 +214,16 @@ define void @insertelement_v16f32(ptr %a, ptr %b) #0 { define void @insertelement_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: index z0.s, #0, #1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; CHECK-NEXT: fmov s0, #5.00000000 -; CHECK-NEXT: mov z2.s, p1/m, s0 -; CHECK-NEXT: st1w { z2.s }, p0, [x1] +; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: fmov s1, #5.00000000 +; CHECK-NEXT: mov z0.s, p0/m, s1 +; CHECK-NEXT: st1w { z0.s }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <32 x float>, ptr %a %r = insertelement <32 x float> %op1, float 5.0, i64 31 @@ -234,16 +234,16 @@ define void @insertelement_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @insertelement_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v64f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: index z0.s, #0, #1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; CHECK-NEXT: fmov s0, #5.00000000 -; CHECK-NEXT: mov z2.s, p1/m, s0 -; CHECK-NEXT: st1w { z2.s }, p0, [x1] +; CHECK-NEXT: ptrue p1.s, vl64 +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: fmov s1, #5.00000000 +; CHECK-NEXT: mov z0.s, p0/m, s1 +; CHECK-NEXT: st1w { z0.s }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <64 x float>, ptr %a %r = insertelement <64 x float> %op1, float 5.0, i64 63 @@ -276,16 +276,16 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 define void @insertelement_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d -; CHECK-NEXT: fmov d0, #5.00000000 -; CHECK-NEXT: mov z2.d, p1/m, d0 -; CHECK-NEXT: st1d { z2.d }, p0, [x1] +; CHECK-NEXT: ptrue p1.d, vl4 +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: fmov d1, #5.00000000 +; CHECK-NEXT: mov z0.d, p0/m, d1 +; CHECK-NEXT: st1d { z0.d }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %r = insertelement <4 x double> %op1, double 5.0, i64 3 @@ -296,33 +296,33 @@ define void @insertelement_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @insertelement_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: insertelement_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov w8, #3 // =0x3 ; VBITS_GE_256-NEXT: index z0.d, #0, #1 -; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ptrue p0.d ; VBITS_GE_256-NEXT: mov z1.d, x8 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: fmov d2, #5.00000000 -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: mov z3.d, p1/m, d2 -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] -; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_256-NEXT: fmov d0, #5.00000000 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: mov z1.d, p0/m, d0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0] +; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: insertelement_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: mov w8, #7 // =0x7 ; VBITS_GE_512-NEXT: index z0.d, #0, #1 -; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: ptrue p0.d ; VBITS_GE_512-NEXT: mov z1.d, x8 -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d -; VBITS_GE_512-NEXT: fmov d0, #5.00000000 -; VBITS_GE_512-NEXT: mov z2.d, p1/m, d0 -; VBITS_GE_512-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_GE_512-NEXT: ptrue p1.d, vl8 +; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: fmov d1, #5.00000000 +; VBITS_GE_512-NEXT: mov z0.d, p0/m, d1 +; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x double>, ptr %a %r = insertelement <8 x double> %op1, double 5.0, i64 7 @@ -333,16 +333,16 @@ define void @insertelement_v8f64(ptr %a, ptr %b) #0 { define void @insertelement_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d -; CHECK-NEXT: fmov d0, #5.00000000 -; CHECK-NEXT: mov z2.d, p1/m, d0 -; CHECK-NEXT: st1d { z2.d }, p0, [x1] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: fmov d1, #5.00000000 +; CHECK-NEXT: mov z0.d, p0/m, d1 +; CHECK-NEXT: st1d { z0.d }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <16 x double>, ptr %a %r = insertelement <16 x double> %op1, double 5.0, i64 15 @@ -353,16 +353,16 @@ define void @insertelement_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @insertelement_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d -; CHECK-NEXT: fmov d0, #5.00000000 -; CHECK-NEXT: mov z2.d, p1/m, d0 -; CHECK-NEXT: st1d { z2.d }, p0, [x1] +; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: fmov d1, #5.00000000 +; CHECK-NEXT: mov z0.d, p0/m, d1 +; CHECK-NEXT: st1d { z0.d }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <32 x double>, ptr %a %r = insertelement <32 x double> %op1, double 5.0, i64 31 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll index b45a62f4e7581..58fca3a2cf8b6 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll @@ -1388,11 +1388,11 @@ define void @abs_v128i16(ptr %a) vscale_range(2,0) #0 { ; CHECK-NEXT: mov x10, #64 // =0x40 ; CHECK-NEXT: mov x11, #80 // =0x50 ; CHECK-NEXT: mov x12, #32 // =0x20 -; CHECK-NEXT: mov x13, #48 // =0x30 -; CHECK-NEXT: mov x14, #16 // =0x10 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] ; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: mov x13, #48 // =0x30 +; CHECK-NEXT: mov x14, #16 // =0x10 ; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll index 11ed69513917c..0ddf434eff930 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll @@ -15,9 +15,9 @@ target triple = "aarch64-unknown-linux-gnu" define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: sdiv_v8i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 @@ -94,11 +94,11 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: sshll2 v3.8h, v0.16b, #0 ; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 @@ -203,7 +203,6 @@ define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: sdiv_v128i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: ptrue p0.s, vl64 @@ -221,7 +220,8 @@ define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h -; CHECK-NEXT: st1b { z1.h }, p1, [x0] +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: st1b { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -260,14 +260,14 @@ define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h +; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z1.h, p1, z1.h, z2.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h -; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b ; CHECK-NEXT: splice z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] @@ -284,18 +284,18 @@ define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: sdiv_v4i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; VBITS_GE_128-NEXT: xtn v0.4h, v0.4s ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: sdiv_v4i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: sshll v1.4s, v1.4h, #0 ; VBITS_GE_256-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: sdivr z1.s, p0/m, z1.s, z0.s ; VBITS_GE_256-NEXT: mov w8, v1.s[1] ; VBITS_GE_256-NEXT: mov v0.16b, v1.16b @@ -309,9 +309,9 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: sdiv_v4i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: sshll v1.4s, v1.4h, #0 ; VBITS_GE_512-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: sdivr z1.s, p0/m, z1.s, z0.s ; VBITS_GE_512-NEXT: mov w8, v1.s[1] ; VBITS_GE_512-NEXT: mov v0.16b, v1.16b @@ -329,11 +329,11 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: sdiv_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h @@ -341,9 +341,9 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: sdiv_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s @@ -353,9 +353,9 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: sdiv_v8i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s @@ -369,8 +369,8 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define void @sdiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q4, q1, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldr q0, [x0, #16] ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 @@ -542,8 +542,8 @@ define void @sdiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @sdiv_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: sdivr z0.s, p0/m, z0.s, z1.s @@ -664,8 +664,8 @@ define void @sdiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @sdiv_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: sdivr z0.d, p0/m, z0.d, z1.d @@ -748,9 +748,9 @@ define void @sdiv_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: udiv_v8i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 @@ -827,11 +827,11 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: udiv_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: ushll2 v3.8h, v0.16b, #0 ; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 @@ -980,14 +980,14 @@ define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h +; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z1.h, p1, z1.h, z2.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h -; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b ; CHECK-NEXT: splice z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] @@ -1004,18 +1004,18 @@ define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: udiv_v4i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; VBITS_GE_128-NEXT: xtn v0.4h, v0.4s ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: udiv_v4i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: ushll v1.4s, v1.4h, #0 ; VBITS_GE_256-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: udivr z1.s, p0/m, z1.s, z0.s ; VBITS_GE_256-NEXT: mov w8, v1.s[1] ; VBITS_GE_256-NEXT: mov v0.16b, v1.16b @@ -1029,9 +1029,9 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: udiv_v4i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: ushll v1.4s, v1.4h, #0 ; VBITS_GE_512-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: udivr z1.s, p0/m, z1.s, z0.s ; VBITS_GE_512-NEXT: mov w8, v1.s[1] ; VBITS_GE_512-NEXT: mov v0.16b, v1.16b @@ -1049,11 +1049,11 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: udiv_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h @@ -1061,9 +1061,9 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: udiv_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s @@ -1073,9 +1073,9 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: udiv_v8i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s @@ -1089,8 +1089,8 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define void @udiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q4, q1, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldr q0, [x0, #16] ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 @@ -1253,8 +1253,8 @@ define void @udiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @udiv_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: udivr z0.s, p0/m, z0.s, z1.s @@ -1375,8 +1375,8 @@ define void @udiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @udiv_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: udivr z0.d, p0/m, z0.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll index 756e5f4cddf80..4feb86305f8f6 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll @@ -58,8 +58,8 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) vscale_range(2,0) #0 { define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: sext_v16i8_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -308,8 +308,8 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: sext_v8i16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -472,8 +472,8 @@ define void @sext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: sext_v4i32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -554,8 +554,8 @@ define void @sext_v32i32_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: zext_v16i8_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -804,8 +804,8 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: zext_v8i16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -968,8 +968,8 @@ define void @zext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: zext_v4i32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll index 38444d83c1d7e..2d78945399176 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -15,9 +15,9 @@ target triple = "aarch64-unknown-linux-gnu" define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: srem_v8i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v2.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: sshll v3.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 @@ -97,9 +97,9 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: srem_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: sshll2 v3.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 @@ -277,8 +277,8 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z5.s ; CHECK-NEXT: ptrue p1.h, vl64 -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z5.h, p1, z5.h, z3.h @@ -300,9 +300,9 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: srem_v4i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v2.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: sshll v3.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: xtn v2.4h, v2.4s ; VBITS_GE_128-NEXT: mls v0.4h, v2.4h, v1.4h @@ -310,9 +310,9 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: srem_v4i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: sshll v2.4s, v1.4h, #0 ; VBITS_GE_256-NEXT: sshll v3.4s, v0.4h, #0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: mov w8, v2.s[1] ; VBITS_GE_256-NEXT: mov v3.16b, v2.16b @@ -326,9 +326,9 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: srem_v4i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: sshll v2.4s, v1.4h, #0 ; VBITS_GE_512-NEXT: sshll v3.4s, v0.4h, #0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_512-NEXT: mov w8, v2.s[1] ; VBITS_GE_512-NEXT: mov v3.16b, v2.16b @@ -346,9 +346,9 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: srem_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v4.4s, v0.4h, #0 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: sshll v3.4s, v1.4h, #0 @@ -359,11 +359,11 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: srem_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_256-NEXT: mls v0.8h, v2.8h, v1.8h @@ -372,11 +372,11 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: srem_v8i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_512-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_512-NEXT: sunpklo z3.s, z0.h +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_512-NEXT: mls v0.8h, v2.8h, v1.8h @@ -389,8 +389,8 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define void @srem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q4, q1, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldr q0, [x0, #16] ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 @@ -582,25 +582,25 @@ define void @srem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @srem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z4, z1 ; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z0.s +; VBITS_GE_128-NEXT: movprfx z19, z2 +; VBITS_GE_128-NEXT: sdiv z19.s, p0/m, z19.s, z3.s ; VBITS_GE_128-NEXT: movprfx z7, z5 ; VBITS_GE_128-NEXT: sdiv z7.s, p0/m, z7.s, z6.s ; VBITS_GE_128-NEXT: movprfx z18, z16 ; VBITS_GE_128-NEXT: sdiv z18.s, p0/m, z18.s, z17.s -; VBITS_GE_128-NEXT: movprfx z19, z2 -; VBITS_GE_128-NEXT: sdiv z19.s, p0/m, z19.s, z3.s -; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s -; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s ; VBITS_GE_128-NEXT: mls v1.4s, v4.4s, v0.4s ; VBITS_GE_128-NEXT: mls v2.4s, v19.4s, v3.4s -; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] +; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s +; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s ; VBITS_GE_128-NEXT: stp q1, q2, [x0] +; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v16i32: @@ -730,26 +730,26 @@ define void @srem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @srem_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z4, z1 ; VBITS_GE_128-NEXT: sdiv z4.d, p0/m, z4.d, z0.d +; VBITS_GE_128-NEXT: movprfx z19, z2 +; VBITS_GE_128-NEXT: sdiv z19.d, p0/m, z19.d, z3.d ; VBITS_GE_128-NEXT: movprfx z7, z5 ; VBITS_GE_128-NEXT: sdiv z7.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: movprfx z18, z16 ; VBITS_GE_128-NEXT: sdiv z18.d, p0/m, z18.d, z17.d -; VBITS_GE_128-NEXT: movprfx z19, z2 -; VBITS_GE_128-NEXT: sdiv z19.d, p0/m, z19.d, z3.d -; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d -; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: msb z0.d, p0/m, z4.d, z1.d ; VBITS_GE_128-NEXT: movprfx z1, z2 ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z19.d, z3.d -; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] +; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d +; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v8i64: @@ -833,9 +833,9 @@ define void @srem_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: urem_v8i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v2.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: ushll v3.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 @@ -915,9 +915,9 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: urem_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: ushll2 v3.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 @@ -1095,8 +1095,8 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z5.s ; CHECK-NEXT: ptrue p1.h, vl64 -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z5.h, p1, z5.h, z3.h @@ -1118,9 +1118,9 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: urem_v4i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v2.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: ushll v3.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: xtn v2.4h, v2.4s ; VBITS_GE_128-NEXT: mls v0.4h, v2.4h, v1.4h @@ -1128,9 +1128,9 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: urem_v4i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: ushll v2.4s, v1.4h, #0 ; VBITS_GE_256-NEXT: ushll v3.4s, v0.4h, #0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: mov w8, v2.s[1] ; VBITS_GE_256-NEXT: mov v3.16b, v2.16b @@ -1144,9 +1144,9 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: urem_v4i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: ushll v2.4s, v1.4h, #0 ; VBITS_GE_512-NEXT: ushll v3.4s, v0.4h, #0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_512-NEXT: mov w8, v2.s[1] ; VBITS_GE_512-NEXT: mov v3.16b, v2.16b @@ -1164,9 +1164,9 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: urem_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v4.4s, v0.4h, #0 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: ushll v3.4s, v1.4h, #0 @@ -1177,11 +1177,11 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: urem_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_256-NEXT: mls v0.8h, v2.8h, v1.8h @@ -1190,11 +1190,11 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: urem_v8i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_512-NEXT: uunpklo z2.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z3.s, z0.h +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_512-NEXT: mls v0.8h, v2.8h, v1.8h @@ -1207,8 +1207,8 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define void @urem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q4, q1, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldr q0, [x0, #16] ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 @@ -1400,25 +1400,25 @@ define void @urem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @urem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z4, z1 ; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z0.s +; VBITS_GE_128-NEXT: movprfx z19, z2 +; VBITS_GE_128-NEXT: udiv z19.s, p0/m, z19.s, z3.s ; VBITS_GE_128-NEXT: movprfx z7, z5 ; VBITS_GE_128-NEXT: udiv z7.s, p0/m, z7.s, z6.s ; VBITS_GE_128-NEXT: movprfx z18, z16 ; VBITS_GE_128-NEXT: udiv z18.s, p0/m, z18.s, z17.s -; VBITS_GE_128-NEXT: movprfx z19, z2 -; VBITS_GE_128-NEXT: udiv z19.s, p0/m, z19.s, z3.s -; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s -; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s ; VBITS_GE_128-NEXT: mls v1.4s, v4.4s, v0.4s ; VBITS_GE_128-NEXT: mls v2.4s, v19.4s, v3.4s -; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] +; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s +; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s ; VBITS_GE_128-NEXT: stp q1, q2, [x0] +; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v16i32: @@ -1548,26 +1548,26 @@ define void @urem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @urem_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z4, z1 ; VBITS_GE_128-NEXT: udiv z4.d, p0/m, z4.d, z0.d +; VBITS_GE_128-NEXT: movprfx z19, z2 +; VBITS_GE_128-NEXT: udiv z19.d, p0/m, z19.d, z3.d ; VBITS_GE_128-NEXT: movprfx z7, z5 ; VBITS_GE_128-NEXT: udiv z7.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: movprfx z18, z16 ; VBITS_GE_128-NEXT: udiv z18.d, p0/m, z18.d, z17.d -; VBITS_GE_128-NEXT: movprfx z19, z2 -; VBITS_GE_128-NEXT: udiv z19.d, p0/m, z19.d, z3.d -; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d -; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: msb z0.d, p0/m, z4.d, z1.d ; VBITS_GE_128-NEXT: movprfx z1, z2 ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z19.d, z3.d -; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] +; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d +; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll index 710dce4de6dda..37396ba7011be 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll @@ -34,14 +34,14 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) vscale_ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: mov z0.b, w2 -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; CHECK-NEXT: sel z0.b, p1, z1.b, z2.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ptrue p1.b, vl32 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1] +; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x i8>, ptr %a %op2 = load volatile <32 x i8>, ptr %b @@ -53,31 +53,31 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v64i8(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: mov z0.b, w2 +; VBITS_GE_256-NEXT: ptrue p0.b ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 -; VBITS_GE_256-NEXT: ptrue p1.b -; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x8] -; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1, x8] -; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.b, p1, z1.b, z3.b -; VBITS_GE_256-NEXT: mov z0.b, p1/m, z2.b -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ptrue p1.b, vl32 +; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p1/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.b, p0, z0.b, z2.b +; VBITS_GE_256-NEXT: sel z1.b, p0, z1.b, z3.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p1, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v64i8: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: mov z0.b, w2 -; VBITS_GE_512-NEXT: ptrue p1.b -; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1b { z2.b }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; VBITS_GE_512-NEXT: sel z0.b, p1, z1.b, z2.b -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p0.b +; VBITS_GE_512-NEXT: ptrue p1.b, vl64 +; VBITS_GE_512-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.b, p0, z0.b, z1.b +; VBITS_GE_512-NEXT: st1b { z0.b }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <64 x i8>, ptr %a %op2 = load volatile <64 x i8>, ptr %b @@ -89,14 +89,14 @@ define void @select_v64i8(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v128i8(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v128i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: mov z0.b, w2 -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; CHECK-NEXT: sel z0.b, p1, z1.b, z2.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ptrue p1.b, vl128 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1] +; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <128 x i8>, ptr %a %op2 = load volatile <128 x i8>, ptr %b @@ -108,14 +108,14 @@ define void @select_v128i8(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v256i8(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v256i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: mov z0.b, w2 -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; CHECK-NEXT: sel z0.b, p1, z1.b, z2.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ptrue p1.b, vl256 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1] +; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <256 x i8>, ptr %a %op2 = load volatile <256 x i8>, ptr %b @@ -153,15 +153,15 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) vscale_ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl16 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x i16>, ptr %a %op2 = load volatile <16 x i16>, ptr %b @@ -173,33 +173,33 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v32i16(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov z0.h, w2 +; VBITS_GE_256-NEXT: ptrue p0.h ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.h, p1, z1.h, z3.h -; VBITS_GE_256-NEXT: mov z0.h, p1/m, z2.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p1/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.h, p0, z0.h, z2.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z3.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p1, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v32i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov z0.h, w2 -; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: ptrue p0.h +; VBITS_GE_512-NEXT: ptrue p1.h, vl32 ; VBITS_GE_512-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; VBITS_GE_512-NEXT: sel z0.h, p1, z1.h, z2.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.h, p0, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <32 x i16>, ptr %a %op2 = load volatile <32 x i16>, ptr %b @@ -211,15 +211,15 @@ define void @select_v32i16(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v64i16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v64i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x i16>, ptr %a %op2 = load volatile <64 x i16>, ptr %b @@ -231,15 +231,15 @@ define void @select_v64i16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v128i16(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v128i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <128 x i16>, ptr %a %op2 = load volatile <128 x i16>, ptr %b @@ -277,15 +277,15 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) vscale_ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl8 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <8 x i32>, ptr %a %op2 = load volatile <8 x i32>, ptr %b @@ -297,33 +297,33 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v16i32(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: and w8, w2, #0x1 -; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: ptrue p0.s ; VBITS_GE_256-NEXT: mov z0.s, w8 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.s, p1, z1.s, z3.s -; VBITS_GE_256-NEXT: mov z0.s, p1/m, z2.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.s, p0, z0.s, z2.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v16i32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: and w8, w2, #0x1 -; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: ptrue p0.s ; VBITS_GE_512-NEXT: mov z0.s, w8 -; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; VBITS_GE_512-NEXT: sel z0.s, p1, z1.s, z2.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p1.s, vl16 +; VBITS_GE_512-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.s, p0, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <16 x i32>, ptr %a %op2 = load volatile <16 x i32>, ptr %b @@ -335,15 +335,15 @@ define void @select_v16i32(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v32i32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x i32>, ptr %a %op2 = load volatile <32 x i32>, ptr %b @@ -355,15 +355,15 @@ define void @select_v32i32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v64i32(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v64i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl64 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x i32>, ptr %a %op2 = load volatile <64 x i32>, ptr %b @@ -401,16 +401,16 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) vscale_ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl4 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <4 x i64>, ptr %a %op2 = load volatile <4 x i64>, ptr %b @@ -422,35 +422,35 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v8i64(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_256-NEXT: and x8, x2, #0x1 -; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ptrue p0.d ; VBITS_GE_256-NEXT: mov z0.d, x8 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.d, p1, z1.d, z3.d -; VBITS_GE_256-NEXT: mov z0.d, p1/m, z2.d -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.d, p0, z0.d, z2.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z3.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v8i64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_512-NEXT: and x8, x2, #0x1 -; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: ptrue p0.d ; VBITS_GE_512-NEXT: mov z0.d, x8 -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_GE_512-NEXT: sel z0.d, p1, z1.d, z2.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p1.d, vl8 +; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.d, p0, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <8 x i64>, ptr %a %op2 = load volatile <8 x i64>, ptr %b @@ -462,16 +462,16 @@ define void @select_v8i64(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v16i64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x i64>, ptr %a %op2 = load volatile <16 x i64>, ptr %b @@ -483,16 +483,16 @@ define void @select_v16i64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v32i64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x i64>, ptr %a %op2 = load volatile <32 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll index 50040eaa61e6c..5bb012ae57503 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll @@ -131,8 +131,8 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 { define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: ucvtf_v8i16_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -354,7 +354,6 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.s @@ -363,7 +362,8 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v16i32_v16f16: @@ -535,8 +535,8 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 { define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: ucvtf_v4i32_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x1] @@ -759,7 +759,6 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.d @@ -768,7 +767,8 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v8i64_v8f32: @@ -1038,8 +1038,8 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 { define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: scvtf_v8i16_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -1273,7 +1273,6 @@ define void @scvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.s @@ -1282,7 +1281,8 @@ define void @scvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v16i32_v16f16: @@ -1454,8 +1454,8 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 { define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: scvtf_v4i32_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x1] @@ -1684,7 +1684,6 @@ define void @scvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.d @@ -1693,7 +1692,8 @@ define void @scvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v8i64_v8f32: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll index e23151475014d..f2ad98f8caec9 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll @@ -246,16 +246,16 @@ define void @masked_gather_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [z0.d] ; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [z1.d] -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_gather_v8i32: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll index 67a53d4e15f3b..55d37d1bda5e4 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll @@ -11,8 +11,8 @@ target triple = "aarch64-unknown-linux-gnu" define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) { ; CHECK-LABEL: masked_load_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll index bdd6ce0647016..1a19b77f53c67 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll @@ -11,12 +11,12 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) { ; CHECK-LABEL: masked_store_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: shl v0.16b, v0.16b, #7 -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: st1b { z1.b }, p0, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask) ret void @@ -27,11 +27,11 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) { ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: shl v0.8h, v0.8h, #15 ; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: st1h { z1.h }, p0, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void @@ -42,11 +42,11 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) { ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: st1w { z1.s }, p0, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void @@ -57,11 +57,11 @@ define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) { ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: shl v0.2d, v0.2d, #63 ; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z1.d }, p0, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask) ret void diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 92fce4584f6a9..27e95489f8ad7 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -41,11 +41,11 @@ define void @masked_gather_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: cmeq v0.4h, v0.4h, #0 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1b { z0.d }, p1/z, [z1.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1b { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1b { z0.d }, p0, [x0] ; CHECK-NEXT: ret %cval = load <4 x i8>, ptr %a @@ -65,7 +65,6 @@ define void @masked_gather_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: zip2 v1.8b, v0.8b, v0.8b ; VBITS_GE_256-NEXT: zip1 v0.8b, v0.8b, v0.8b -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8 ; VBITS_GE_256-NEXT: shl v0.4h, v0.4h, #8 ; VBITS_GE_256-NEXT: sshr v1.4h, v1.4h, #8 @@ -75,10 +74,11 @@ define void @masked_gather_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: ld1b { z1.d }, p1/z, [z2.d] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1b { z0.d }, p0/z, [z2.d] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1b { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1b { z0.d }, p1/z, [z0.d] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s @@ -93,11 +93,11 @@ define void @masked_gather_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1b { z0.d }, p0/z, [z1.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1b { z0.d }, p1/z, [z0.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b @@ -118,11 +118,11 @@ define void @masked_gather_v16i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: cmeq v0.16b, v0.16b, #0 ; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1b { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1b { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b @@ -194,10 +194,10 @@ define void @masked_gather_v4i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: cmeq v0.4h, v0.4h, #0 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: str d0, [x0] @@ -219,15 +219,15 @@ define void @masked_gather_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h ; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z2.d] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [z2.d] +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s @@ -242,10 +242,10 @@ define void @masked_gather_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_512-NEXT: str q0, [x0] @@ -332,9 +332,9 @@ define void @masked_gather_v4i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1w { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret @@ -354,18 +354,18 @@ define void @masked_gather_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p2.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x1] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0 -; VBITS_GE_256-NEXT: punpklo p3.h, p1.b ; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p1.h, p1.b +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p2.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: and p1.b, p3/z, p3.b, p2.b -; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: ld1w { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: cmpne p1.d, p2/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z2.d] -; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] @@ -460,8 +460,8 @@ define void @masked_gather_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @masked_gather_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: cmeq v0.2d, v0.2d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ldr q0, [x1] @@ -481,9 +481,9 @@ define void @masked_gather_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %cval = load <4 x i64>, ptr %a @@ -500,13 +500,13 @@ define void @masked_gather_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [z1.d] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [z2.d] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [z1.d] ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -515,9 +515,9 @@ define void @masked_gather_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [z1.d] +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret %cval = load <8 x i64>, ptr %a @@ -533,9 +533,9 @@ define void @masked_gather_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %cval = load <16 x i64>, ptr %a @@ -551,9 +551,9 @@ define void @masked_gather_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %cval = load <32 x i64>, ptr %a @@ -603,10 +603,10 @@ define void @masked_gather_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: fcmeq v0.4h, v0.4h, #0.0 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: str d0, [x0] @@ -626,17 +626,17 @@ define void @masked_gather_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: fcmeq v0.8h, v0.8h, #0.0 -; VBITS_GE_256-NEXT: sunpklo z2.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h ; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z1.d] -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [z2.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s @@ -651,10 +651,10 @@ define void @masked_gather_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: fcmeq v0.8h, v0.8h, #0.0 ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_512-NEXT: str q0, [x0] @@ -741,9 +741,9 @@ define void @masked_gather_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1w { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret @@ -763,18 +763,18 @@ define void @masked_gather_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p2.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x1] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; VBITS_GE_256-NEXT: punpklo p3.h, p1.b ; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p1.h, p1.b +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p2.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: and p1.b, p3/z, p3.b, p2.b -; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: ld1w { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: cmpne p1.d, p2/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z2.d] -; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] @@ -869,8 +869,8 @@ define void @masked_gather_v1f64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @masked_gather_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ldr q0, [x1] @@ -1202,8 +1202,8 @@ define void @masked_gather_passthru(ptr %a, ptr %b, ptr %c) vscale_range(16,0) # ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x2] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] -; CHECK-NEXT: punpklo p3.h, p1.b -; CHECK-NEXT: ld1w { z0.d }, p3/z, [z0.d] +; CHECK-NEXT: punpklo p2.h, p1.b +; CHECK-NEXT: ld1w { z0.d }, p2/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll index 467378e7da59b..c22d9e71c51a9 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -35,9 +35,9 @@ define <2 x half> @masked_load_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 { define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_load_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -53,9 +53,9 @@ define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 { ; CHECK-LABEL: masked_load_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -401,8 +401,8 @@ define void @masked_load_sext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: ldr q0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 @@ -436,8 +436,8 @@ define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_sext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl8 ; VBITS_GE_256-NEXT: ldr d0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 @@ -504,8 +504,8 @@ define void @masked_load_sext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_sext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: ldr q0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 @@ -603,8 +603,8 @@ define void @masked_load_zext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: ldr q0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 @@ -638,8 +638,8 @@ define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_zext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl8 ; VBITS_GE_256-NEXT: ldr d0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 @@ -706,8 +706,8 @@ define void @masked_load_zext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_zext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: ldr q0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 @@ -782,11 +782,11 @@ define void @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p1.b, vl16 -; VBITS_GE_256-NEXT: ptrue p2.b, vl32 ; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: splice z1.b, p1, z1.b, z0.b -; VBITS_GE_256-NEXT: cmpne p1.b, p2/z, z1.b, #0 +; VBITS_GE_256-NEXT: ptrue p1.b, vl32 +; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z1.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0] ; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 @@ -1000,11 +1000,11 @@ define void @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p1.s, vl4 -; VBITS_GE_256-NEXT: ptrue p2.s, vl8 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s -; VBITS_GE_256-NEXT: cmpne p1.s, p2/z, z1.s, #0 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0] ; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 @@ -1041,11 +1041,11 @@ define void @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p1.b, vl16 -; VBITS_GE_256-NEXT: ptrue p2.b, vl32 ; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: splice z1.b, p1, z1.b, z0.b -; VBITS_GE_256-NEXT: cmpne p1.b, p2/z, z1.b, #0 +; VBITS_GE_256-NEXT: ptrue p1.b, vl32 +; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z1.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0] ; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 @@ -1259,11 +1259,11 @@ define void @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p1.s, vl4 -; VBITS_GE_256-NEXT: ptrue p2.s, vl8 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s -; VBITS_GE_256-NEXT: cmpne p1.s, p2/z, z1.s, #0 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0] ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index e2d341c22efc2..e3e06dcdf17f3 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -39,12 +39,12 @@ define void @masked_scatter_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: cmeq v1.4h, v0.4h, #0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1b { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1b { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x i8>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -61,11 +61,11 @@ define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v1.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: zip1 v3.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: zip1 v2.8b, v1.8b, v0.8b ; VBITS_GE_256-NEXT: zip2 v1.8b, v1.8b, v0.8b ; VBITS_GE_256-NEXT: zip2 v0.8b, v0.8b, v0.8b ; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: shl v2.4h, v2.4h, #8 ; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -78,10 +78,10 @@ define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1b { z3.d }, p1, [z4.d] -; VBITS_GE_256-NEXT: st1b { z0.d }, p0, [z2.d] +; VBITS_GE_256-NEXT: st1b { z3.d }, p1, [z2.d] +; VBITS_GE_256-NEXT: st1b { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_scatter_v8i8: @@ -92,12 +92,12 @@ define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_512-NEXT: sunpklo z1.h, z1.b ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_512-NEXT: st1b { z0.d }, p0, [z2.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: st1b { z0.d }, p1, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x i8>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -115,12 +115,12 @@ define void @masked_scatter_v16i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1b { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1b { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x i8>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -135,13 +135,13 @@ define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: uunpklo z1.h, z0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z0.s, z1.h ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1b { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret @@ -187,10 +187,10 @@ define void @masked_scatter_v4i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1h { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1h { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x i16>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -208,20 +208,20 @@ define void @masked_scatter_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: cmeq v1.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h ; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z4.d] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z2.d] +; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z2.d] +; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_scatter_v8i16: @@ -232,10 +232,10 @@ define void @masked_scatter_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [z2.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: st1h { z0.d }, p1, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x i16>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -253,9 +253,9 @@ define void @masked_scatter_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x i16>, ptr %a @@ -274,9 +274,9 @@ define void @masked_scatter_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x i16>, ptr %a @@ -317,9 +317,9 @@ define void @masked_scatter_v4i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: cmeq v1.4s, v0.4s, #0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1w { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1w { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x i32>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -333,21 +333,21 @@ define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0 -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff -; VBITS_GE_256-NEXT: punpklo p2.h, p1.b ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p0.h, p0.b +; VBITS_GE_256-NEXT: and p0.b, p0/z, p0.b, p1.b ; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z3.d] ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: and p1.b, p2/z, p2.b, p0.b -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1w { z2.d }, p1, [z3.d] +; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; @@ -434,8 +434,8 @@ define void @masked_scatter_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: cmeq v1.2d, v0.2d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: ldr q1, [x1] @@ -454,8 +454,8 @@ define void @masked_scatter_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x i64>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -484,8 +484,8 @@ define void @masked_scatter_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [z1.d] +; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x i64>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -500,8 +500,8 @@ define void @masked_scatter_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x i64>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -516,8 +516,8 @@ define void @masked_scatter_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x i64>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -539,15 +539,15 @@ define void @masked_scatter_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: fcmeq v2.4h, v1.4h, #0.0 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: mov v0.h[0], v2.h[0] ; CHECK-NEXT: mov w8, v2.s[1] ; CHECK-NEXT: mov v0.h[1], w8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ldr q0, [x1] -; CHECK-NEXT: st1h { z1.d }, p0, [z0.d] +; CHECK-NEXT: uunpklo z0.d, z1.s +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <2 x half>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -565,10 +565,10 @@ define void @masked_scatter_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1h { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1h { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x half>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -586,20 +586,20 @@ define void @masked_scatter_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: fcmeq v1.8h, v0.8h, #0.0 ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h ; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z4.d] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z2.d] +; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z2.d] +; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_scatter_v8f16: @@ -610,10 +610,10 @@ define void @masked_scatter_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [z2.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: st1h { z0.d }, p1, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x half>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -631,9 +631,9 @@ define void @masked_scatter_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x half>, ptr %a @@ -652,9 +652,9 @@ define void @masked_scatter_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a @@ -695,9 +695,9 @@ define void @masked_scatter_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: fcmeq v1.4s, v0.4s, #0.0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1w { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1w { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x float>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -711,21 +711,21 @@ define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff -; VBITS_GE_256-NEXT: punpklo p2.h, p1.b ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p0.h, p0.b +; VBITS_GE_256-NEXT: and p0.b, p0/z, p0.b, p1.b ; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z3.d] ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: and p1.b, p2/z, p2.b, p0.b -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1w { z2.d }, p1, [z3.d] +; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; @@ -812,8 +812,8 @@ define void @masked_scatter_v1f64(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @masked_scatter_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcmeq v1.2d, v0.2d, #0.0 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: ldr q1, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll index 68fb4cc6afb09..b0d4f79aea110 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -34,9 +34,9 @@ define void @masked_store_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 { define void @masked_store_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_store_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: fcmeq v1.2s, v0.2s, v1.2s ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -51,9 +51,9 @@ define void @masked_store_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { define void @masked_store_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 { ; CHECK-LABEL: masked_store_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcmeq v1.4s, v0.4s, v1.4s ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -161,11 +161,11 @@ define void @masked_store_trunc_v8i64i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s -; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z3.s, p0, z3.s, z2.s -; VBITS_GE_256-NEXT: cmpne p0.s, p1/z, z3.s, #0 -; VBITS_GE_256-NEXT: st1b { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z3.s, #0 +; VBITS_GE_256-NEXT: st1b { z1.s }, p1, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i8: @@ -197,14 +197,14 @@ define void @masked_store_trunc_v8i64i16(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 -; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s -; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z3.s, p0, z3.s, z2.s ; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z3.h @@ -246,11 +246,11 @@ define void @masked_store_trunc_v8i64i32(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s -; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z3.s, p0, z3.s, z2.s -; VBITS_GE_256-NEXT: cmpne p0.s, p1/z, z3.s, #0 -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z3.s, #0 +; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i32: @@ -282,14 +282,14 @@ define void @masked_store_trunc_v16i32i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 -; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b -; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h -; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h ; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h ; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b ; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b ; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] @@ -327,17 +327,17 @@ define void @masked_store_trunc_v16i32i16(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 ; VBITS_GE_256-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h ; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b -; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b ; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] ; VBITS_GE_256-NEXT: sunpklo z2.h, z3.b -; VBITS_GE_256-NEXT: cmpne p0.h, p1/z, z2.h, #0 +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z2.h, #0 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] ; VBITS_GE_256-NEXT: ret ; @@ -375,11 +375,11 @@ define void @masked_store_trunc_v32i16i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b -; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b ; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b ; VBITS_GE_256-NEXT: splice z3.b, p0, z3.b, z2.b -; VBITS_GE_256-NEXT: cmpne p0.b, p1/z, z3.b, #0 -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x2] +; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z3.b, #0 +; VBITS_GE_256-NEXT: st1b { z1.b }, p1, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_store_trunc_v32i16i8: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index a5303c901b80f..fb169491b0c90 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -64,7 +64,6 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: mov x10, #8 // =0x8 -; CHECK-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: mov v2.b[7], w11 ; CHECK-NEXT: mov v1.b[7], w9 ; CHECK-NEXT: uunpklo z3.h, z3.b @@ -86,22 +85,23 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: asr z2.s, z2.s, #31 ; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x9, lsl #2] ; CHECK-NEXT: and z2.s, z2.s, #0x1 ; CHECK-NEXT: and z1.s, z1.s, #0x1 -; CHECK-NEXT: mov z4.s, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 ; CHECK-NEXT: cmpne p3.s, p0/z, z2.s, #0 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 -; CHECK-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] -; CHECK-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] -; CHECK-NEXT: mov z3.s, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z2.s, p1/m, #0 // =0x0 -; CHECK-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2] -; CHECK-NEXT: st1w { z2.s }, p0, [x0] +; CHECK-NEXT: cmpne p4.s, p0/z, z1.s, #0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: mov z3.s, p2/m, #0 // =0x0 +; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; CHECK-NEXT: mov z2.s, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z1.s, p4/m, #0 // =0x0 +; CHECK-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2] +; CHECK-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: .LBB1_2: // %exit ; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll index f97ca05f3bdd4..b633057be139c 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll @@ -34,8 +34,8 @@ define <16 x i8> @splat_v16i8(i8 %a) vscale_range(2,0) #0 { define void @splat_v32i8(i8 %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: mov z0.b, w0 +; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: st1b { z0.b }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <32 x i8> undef, i8 %a, i64 0 @@ -47,8 +47,8 @@ define void @splat_v32i8(i8 %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v64i8(i8 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: mov z0.b, w0 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8] ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] @@ -56,8 +56,8 @@ define void @splat_v64i8(i8 %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v64i8: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: mov z0.b, w0 +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x1] ; VBITS_GE_512-NEXT: ret %insert = insertelement <64 x i8> undef, i8 %a, i64 0 @@ -69,8 +69,8 @@ define void @splat_v64i8(i8 %a, ptr %b) #0 { define void @splat_v128i8(i8 %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v128i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: mov z0.b, w0 +; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: st1b { z0.b }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <128 x i8> undef, i8 %a, i64 0 @@ -82,8 +82,8 @@ define void @splat_v128i8(i8 %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v256i8(i8 %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v256i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: mov z0.b, w0 +; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: st1b { z0.b }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <256 x i8> undef, i8 %a, i64 0 @@ -117,8 +117,8 @@ define <8 x i16> @splat_v8i16(i16 %a) vscale_range(2,0) #0 { define void @splat_v16i16(i16 %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: mov z0.h, w0 +; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <16 x i16> undef, i16 %a, i64 0 @@ -130,8 +130,8 @@ define void @splat_v16i16(i16 %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v32i16(i16 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov z0.h, w0 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] @@ -139,8 +139,8 @@ define void @splat_v32i16(i16 %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v32i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov z0.h, w0 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] ; VBITS_GE_512-NEXT: ret %insert = insertelement <32 x i16> undef, i16 %a, i64 0 @@ -152,8 +152,8 @@ define void @splat_v32i16(i16 %a, ptr %b) #0 { define void @splat_v64i16(i16 %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v64i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov z0.h, w0 +; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <64 x i16> undef, i16 %a, i64 0 @@ -165,8 +165,8 @@ define void @splat_v64i16(i16 %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v128i16(i16 %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v128i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov z0.h, w0 +; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <128 x i16> undef, i16 %a, i64 0 @@ -200,8 +200,8 @@ define <4 x i32> @splat_v4i32(i32 %a) vscale_range(2,0) #0 { define void @splat_v8i32(i32 %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mov z0.s, w0 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <8 x i32> undef, i32 %a, i64 0 @@ -213,8 +213,8 @@ define void @splat_v8i32(i32 %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v16i32(i32 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov z0.s, w0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] @@ -222,8 +222,8 @@ define void @splat_v16i32(i32 %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v16i32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: mov z0.s, w0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_512-NEXT: ret %insert = insertelement <16 x i32> undef, i32 %a, i64 0 @@ -235,8 +235,8 @@ define void @splat_v16i32(i32 %a, ptr %b) #0 { define void @splat_v32i32(i32 %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: mov z0.s, w0 +; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <32 x i32> undef, i32 %a, i64 0 @@ -248,8 +248,8 @@ define void @splat_v32i32(i32 %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v64i32(i32 %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v64i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: mov z0.s, w0 +; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <64 x i32> undef, i32 %a, i64 0 @@ -283,8 +283,8 @@ define <2 x i64> @splat_v2i64(i64 %a) vscale_range(2,0) #0 { define void @splat_v4i64(i64 %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mov z0.d, x0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <4 x i64> undef, i64 %a, i64 0 @@ -296,8 +296,8 @@ define void @splat_v4i64(i64 %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v8i64(i64 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov z0.d, x0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] @@ -305,8 +305,8 @@ define void @splat_v8i64(i64 %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v8i64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: mov z0.d, x0 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_512-NEXT: ret %insert = insertelement <8 x i64> undef, i64 %a, i64 0 @@ -318,8 +318,8 @@ define void @splat_v8i64(i64 %a, ptr %b) #0 { define void @splat_v16i64(i64 %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: mov z0.d, x0 +; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <16 x i64> undef, i64 %a, i64 0 @@ -331,8 +331,8 @@ define void @splat_v16i64(i64 %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v32i64(i64 %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: mov z0.d, x0 +; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <32 x i64> undef, i64 %a, i64 0 @@ -372,8 +372,8 @@ define <8 x half> @splat_v8f16(half %a) vscale_range(2,0) #0 { define void @splat_v16f16(half %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -386,8 +386,8 @@ define void @splat_v16f16(half %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v32f16(half %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: mov z0.h, h0 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] @@ -396,8 +396,8 @@ define void @splat_v32f16(half %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v32f16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov z0.h, h0 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -410,8 +410,8 @@ define void @splat_v32f16(half %a, ptr %b) #0 { define void @splat_v64f16(half %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v64f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -424,8 +424,8 @@ define void @splat_v64f16(half %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v128f16(half %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v128f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -462,8 +462,8 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) vscale_range(2,0) #0 define void @splat_v8f32(float %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -476,8 +476,8 @@ define void @splat_v8f32(float %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v16f32(float %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: mov z0.s, s0 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -486,8 +486,8 @@ define void @splat_v16f32(float %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v16f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: mov z0.s, s0 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -500,8 +500,8 @@ define void @splat_v16f32(float %a, ptr %b) #0 { define void @splat_v32f32(float %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -514,8 +514,8 @@ define void @splat_v32f32(float %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v64f32(float %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v64f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -550,8 +550,8 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) vscale_range(2,0) define void @splat_v4f64(double %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -564,8 +564,8 @@ define void @splat_v4f64(double %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v8f64(double %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: mov z0.d, d0 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] @@ -574,8 +574,8 @@ define void @splat_v8f64(double %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: mov z0.d, d0 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -588,8 +588,8 @@ define void @splat_v8f64(double %a, ptr %b) #0 { define void @splat_v16f64(double %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -602,8 +602,8 @@ define void @splat_v16f64(double %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v32f64(double %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -620,8 +620,8 @@ define void @splat_v32f64(double %a, ptr %b) vscale_range(16,0) #0 { define void @splat_imm_v64i8(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl64 ; CHECK-NEXT: mov z0.b, #1 // =0x1 +; CHECK-NEXT: ptrue p0.b, vl64 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <64 x i8> undef, i8 1, i64 0 @@ -633,8 +633,8 @@ define void @splat_imm_v64i8(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v32i16(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: mov z0.h, #2 // =0x2 +; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <32 x i16> undef, i16 2, i64 0 @@ -646,8 +646,8 @@ define void @splat_imm_v32i16(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v16i32(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: mov z0.s, #3 // =0x3 +; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <16 x i32> undef, i32 3, i64 0 @@ -659,8 +659,8 @@ define void @splat_imm_v16i32(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v8i64(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: mov z0.d, #4 // =0x4 +; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <8 x i64> undef, i64 4, i64 0 @@ -676,8 +676,8 @@ define void @splat_imm_v8i64(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v32f16(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v32f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: fmov z0.h, #5.00000000 +; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <32 x half> undef, half 5.0, i64 0 @@ -689,8 +689,8 @@ define void @splat_imm_v32f16(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v16f32(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: fmov z0.s, #6.00000000 +; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <16 x float> undef, float 6.0, i64 0 @@ -702,8 +702,8 @@ define void @splat_imm_v16f32(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v8f64(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: fmov z0.d, #7.00000000 +; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <8 x double> undef, double 7.0, i64 0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll index 2dc4bddb81a6d..020d5cb53bf21 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll @@ -36,14 +36,14 @@ define void @store_trunc_v8i64i8(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1b { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1b { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: store_trunc_v8i64i8: @@ -117,14 +117,14 @@ define void @store_trunc_v8i64i32(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: store_trunc_v8i64i32: @@ -172,14 +172,14 @@ define void @store_trunc_v16i32i16(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: store_trunc_v16i32i16: @@ -199,14 +199,14 @@ define void @store_trunc_v32i16i8(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: ptrue p1.b, vl32 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z0.b -; VBITS_GE_256-NEXT: st1b { z1.b }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: store_trunc_v32i16i8: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll index 68c234a20d110..28094c7b68e7c 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll @@ -559,13 +559,13 @@ define <8 x i16> @shuffle_index_indices_from_both_ops_i16(ptr %a, ptr %b) { ; ; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops_i16: ; SVE2_128_NOMAX: // %bb.0: -; SVE2_128_NOMAX-NEXT: ptrue p0.h, vl8 ; SVE2_128_NOMAX-NEXT: cnth x8 ; SVE2_128_NOMAX-NEXT: adrp x9, .LCPI7_0 ; SVE2_128_NOMAX-NEXT: adrp x10, .LCPI7_1 ; SVE2_128_NOMAX-NEXT: mov z0.h, w8 ; SVE2_128_NOMAX-NEXT: ldr q1, [x9, :lo12:.LCPI7_0] ; SVE2_128_NOMAX-NEXT: ldr q2, [x10, :lo12:.LCPI7_1] +; SVE2_128_NOMAX-NEXT: ptrue p0.h, vl8 ; SVE2_128_NOMAX-NEXT: mad z0.h, p0/m, z1.h, z2.h ; SVE2_128_NOMAX-NEXT: ldr q1, [x0] ; SVE2_128_NOMAX-NEXT: ldr q2, [x1] @@ -575,13 +575,13 @@ define <8 x i16> @shuffle_index_indices_from_both_ops_i16(ptr %a, ptr %b) { ; ; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_both_ops_i16: ; SVE2_NOMIN_NOMAX: // %bb.0: -; SVE2_NOMIN_NOMAX-NEXT: ptrue p0.h, vl8 ; SVE2_NOMIN_NOMAX-NEXT: cnth x8 ; SVE2_NOMIN_NOMAX-NEXT: adrp x9, .LCPI7_0 ; SVE2_NOMIN_NOMAX-NEXT: adrp x10, .LCPI7_1 ; SVE2_NOMIN_NOMAX-NEXT: mov z0.h, w8 ; SVE2_NOMIN_NOMAX-NEXT: ldr q1, [x9, :lo12:.LCPI7_0] ; SVE2_NOMIN_NOMAX-NEXT: ldr q2, [x10, :lo12:.LCPI7_1] +; SVE2_NOMIN_NOMAX-NEXT: ptrue p0.h, vl8 ; SVE2_NOMIN_NOMAX-NEXT: mad z0.h, p0/m, z1.h, z2.h ; SVE2_NOMIN_NOMAX-NEXT: ldr q1, [x0] ; SVE2_NOMIN_NOMAX-NEXT: ldr q2, [x1] @@ -597,9 +597,9 @@ define <8 x i16> @shuffle_index_indices_from_both_ops_i16(ptr %a, ptr %b) { ; SVE2_MIN_256_NOMAX-NEXT: adrp x9, .LCPI7_1 ; SVE2_MIN_256_NOMAX-NEXT: add x9, x9, :lo12:.LCPI7_1 ; SVE2_MIN_256_NOMAX-NEXT: cnth x10 -; SVE2_MIN_256_NOMAX-NEXT: mov z2.h, w10 ; SVE2_MIN_256_NOMAX-NEXT: ld1h { z0.h }, p0/z, [x8] ; SVE2_MIN_256_NOMAX-NEXT: ld1h { z1.h }, p0/z, [x9] +; SVE2_MIN_256_NOMAX-NEXT: mov z2.h, w10 ; SVE2_MIN_256_NOMAX-NEXT: mad z0.h, p0/m, z2.h, z1.h ; SVE2_MIN_256_NOMAX-NEXT: ldr q1, [x0] ; SVE2_MIN_256_NOMAX-NEXT: ldr q2, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll index 5ff9f0f0df62f..afe13851f0b95 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll @@ -7,11 +7,11 @@ define i64 @scalable_int_min_max(ptr %arg, ptr %arg1, %i37, < ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #3745 // =0xea1 ; CHECK-NEXT: movk w8, #16618, lsl #16 +; CHECK-NEXT: ld1w { z3.d }, p0/z, [x0] ; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: mov w8, #57344 // =0xe000 ; CHECK-NEXT: movk w8, #17535, lsl #16 ; CHECK-NEXT: mov z5.s, w8 -; CHECK-NEXT: ld1w { z3.d }, p0/z, [x0] ; CHECK-NEXT: fmul z4.s, p0/m, z4.s, z3.s ; CHECK-NEXT: fadd z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: mov z5.d, #1023 // =0x3ff diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll b/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll index aefc8de431436..6420071b3dce4 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll @@ -92,8 +92,8 @@ define @fsqrt_recip_8f16( %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte z1.h, z0.h ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: fmul z2.h, z1.h, z1.h ; CHECK-NEXT: fcmne p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: fmul z2.h, z1.h, z1.h ; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h ; CHECK-NEXT: fmul z1.h, z1.h, z2.h ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h @@ -117,8 +117,8 @@ define @fsqrt_recip_4f32( %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte z1.s, z0.s ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: fmul z2.s, z1.s, z1.s ; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fmul z2.s, z1.s, z1.s ; CHECK-NEXT: frsqrts z2.s, z0.s, z2.s ; CHECK-NEXT: fmul z1.s, z1.s, z2.s ; CHECK-NEXT: fmul z2.s, z1.s, z1.s @@ -145,8 +145,8 @@ define @fsqrt_recip_2f64( %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte z1.d, z0.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmul z2.d, z1.d, z1.d ; CHECK-NEXT: fcmne p0.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: fmul z2.d, z1.d, z1.d ; CHECK-NEXT: frsqrts z2.d, z0.d, z2.d ; CHECK-NEXT: fmul z1.d, z1.d, z2.d ; CHECK-NEXT: fmul z2.d, z1.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll index 460d8a8694bc4..1a2ab8d4253ab 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll @@ -51,10 +51,10 @@ define half @fadda_nxv6f16( %v, half %s) { ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #32768 // =0x8000 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: fmov s0, s1 ; CHECK-NEXT: st1h { z2.d }, p1, [sp, #3, mul vl] diff --git a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll index 813f1601e809e..584c29ebcfc04 100644 --- a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll @@ -14,8 +14,8 @@ declare @llvm.fptosi.sat.nxv4f32.nxv4i64( define @test_signed_v2f32_v2i32( %f) { ; CHECK-LABEL: test_signed_v2f32_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff @@ -38,8 +38,8 @@ define @test_signed_v2f32_v2i32( %f) { define @test_signed_v4f32_v4i32( %f) { ; CHECK-LABEL: test_signed_v4f32_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #0x80000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff @@ -67,29 +67,29 @@ define @test_signed_v8f32_v8i32( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 -; CHECK-NEXT: mov z3.s, #0x80000000 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z6.s, #0x7fffffff ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff -; CHECK-NEXT: mov z6.s, #0x7fffffff -; CHECK-NEXT: mov z4.s, w8 -; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z2.s -; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z2.s -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzs z2.s, p0/m, z0.s +; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: fcvtzs z4.s, p0/m, z0.s ; CHECK-NEXT: movprfx z5, z1 ; CHECK-NEXT: fcvtzs z5.s, p0/m, z1.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z4.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z1.s, z4.s +; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z2.s +; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z2.s +; CHECK-NEXT: mov z2.s, #0x80000000 +; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z3.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z1.s, z3.s ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.s, p1/m, z3.s +; CHECK-NEXT: sel z3.s, p1, z2.s, z4.s ; CHECK-NEXT: fcmuo p1.s, p0/z, z0.s, z0.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z1.s, z1.s -; CHECK-NEXT: sel z3.s, p2, z3.s, z5.s -; CHECK-NEXT: sel z0.s, p3, z6.s, z2.s -; CHECK-NEXT: sel z1.s, p4, z6.s, z3.s +; CHECK-NEXT: sel z2.s, p2, z2.s, z5.s +; CHECK-NEXT: sel z0.s, p3, z6.s, z3.s +; CHECK-NEXT: sel z1.s, p4, z6.s, z2.s ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.s, p0/m, #0 // =0x0 @@ -103,8 +103,8 @@ define @test_signed_v8f32_v8i32( %f) { define @test_signed_v4f32_v4i16( %f) { ; CHECK-LABEL: test_signed_v4f32_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #-956301312 // =0xc7000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #65024 // =0xfe00 ; CHECK-NEXT: movk w8, #18175, lsl #16 @@ -132,28 +132,28 @@ define @test_signed_v8f32_v8i16( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #-956301312 // =0xc7000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z5.s, #32767 // =0x7fff ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: mov w8, #65024 // =0xfe00 ; CHECK-NEXT: movk w8, #18175, lsl #16 -; CHECK-NEXT: mov z3.s, w8 -; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s -; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.s, p0/m, z1.s +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: fcvtzs z3.s, p0/m, z1.s ; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: fcvtzs z4.s, p0/m, z0.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z3.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z3.s +; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s +; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z2.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z2.s ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.s, p1/m, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: mov z3.s, p1/m, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s ; CHECK-NEXT: mov z4.s, p2/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.s, p3, z5.s, z2.s +; CHECK-NEXT: sel z0.s, p3, z5.s, z3.s ; CHECK-NEXT: sel z1.s, p4, z5.s, z4.s ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 @@ -169,8 +169,8 @@ define @test_signed_v8f32_v8i16( %f) { define @test_signed_v2f32_v2i64( %f) { ; CHECK-LABEL: test_signed_v2f32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff @@ -198,31 +198,31 @@ define @test_signed_v4f32_v4i64( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z3.d, #0x8000000000000000 -; CHECK-NEXT: mov z4.s, w8 +; CHECK-NEXT: mov z3.s, w8 ; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z1.s +; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.s ; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z4.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z4.s +; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z3.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z3.s ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, p1/m, z3.d +; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d ; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: sel z3.d, p2, z3.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z2.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z3.d +; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d +; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d +; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -246,8 +246,8 @@ declare @llvm.fptosi.sat.nxv4f64.nxv4i64( @test_signed_v2f64_v2i32( %f) { ; CHECK-LABEL: test_signed_v2f64_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4476578029606273024 // =0xc1e0000000000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 @@ -276,30 +276,30 @@ define @test_signed_v4f64_v4i32( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4476578029606273024 // =0xc1e0000000000000 -; CHECK-NEXT: mov z3.d, #0xffffffff80000000 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z6.d, #0x7fffffff ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 -; CHECK-NEXT: mov z6.d, #0x7fffffff ; CHECK-NEXT: movk x8, #16863, lsl #48 -; CHECK-NEXT: mov z4.d, x8 -; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d -; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z1.d +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.d ; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z4.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z4.d +; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d +; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d +; CHECK-NEXT: mov z2.d, #0xffffffff80000000 +; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z3.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z3.d ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, p1/m, z3.d +; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d ; CHECK-NEXT: fcmuo p1.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z3.d, p2, z3.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z2.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z3.d +; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d +; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d +; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -322,48 +322,48 @@ define @test_signed_v8f64_v8i32( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4476578029606273024 // =0xc1e0000000000000 -; CHECK-NEXT: mov z26.d, #0x7fffffff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z5.d, #0xffffffff80000000 ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 +; CHECK-NEXT: mov z26.d, #0x7fffffff ; CHECK-NEXT: movk x8, #16863, lsl #48 -; CHECK-NEXT: mov z5.d, x8 -; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z4.d -; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z4.d -; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, z4.d -; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, z4.d -; CHECK-NEXT: mov z4.d, #0xffffffff80000000 -; CHECK-NEXT: movprfx z6, z1 -; CHECK-NEXT: fcvtzs z6.d, p0/m, z1.d ; CHECK-NEXT: movprfx z7, z0 ; CHECK-NEXT: fcvtzs z7.d, p0/m, z0.d ; CHECK-NEXT: movprfx z24, z3 ; CHECK-NEXT: fcvtzs z24.d, p0/m, z3.d +; CHECK-NEXT: mov z6.d, x8 ; CHECK-NEXT: movprfx z25, z2 ; CHECK-NEXT: fcvtzs z25.d, p0/m, z2.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z5.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z5.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z3.d, z5.d +; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z4.d +; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z4.d +; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, z4.d +; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, z4.d +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z6.d +; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z6.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z3.d, z6.d ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b ; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: mov z4.d, p1/m, z5.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z2.d, z6.d ; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z6.d, p1/m, z4.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z2.d, z5.d -; CHECK-NEXT: sel z5.d, p2, z4.d, z7.d +; CHECK-NEXT: sel z6.d, p2, z5.d, z7.d ; CHECK-NEXT: fcmuo p2.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z7.d, p3, z4.d, z24.d +; CHECK-NEXT: sel z7.d, p3, z5.d, z24.d ; CHECK-NEXT: fcmuo p3.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z4.d, p4, z4.d, z25.d +; CHECK-NEXT: sel z5.d, p4, z5.d, z25.d ; CHECK-NEXT: fcmuo p4.d, p0/z, z3.d, z3.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z2.d, z2.d -; CHECK-NEXT: sel z0.d, p5, z26.d, z6.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z5.d +; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d +; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z4.d +; CHECK-NEXT: sel z3.d, p1, z26.d, z5.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 @@ -387,28 +387,28 @@ define @test_signed_v4f64_v4i16( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4548635623644200960 // =0xc0e0000000000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z5.d, #32767 // =0x7fff ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #281200098803712 // =0xffc000000000 ; CHECK-NEXT: movk x8, #16607, lsl #48 -; CHECK-NEXT: mov z3.d, x8 -; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d -; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z1.d +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: fcvtzs z3.d, p0/m, z1.d ; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z3.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z3.d +; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d +; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z2.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z2.d ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, p1/m, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: mov z3.d, p1/m, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: fcmuo p1.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d ; CHECK-NEXT: mov z4.d, p2/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.d, p3, z5.d, z2.d +; CHECK-NEXT: sel z0.d, p3, z5.d, z3.d ; CHECK-NEXT: sel z1.d, p4, z5.d, z4.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 @@ -432,34 +432,34 @@ define @test_signed_v8f64_v8i16( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4548635623644200960 // =0xc0e0000000000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, #32767 // =0x7fff ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: mov x8, #281200098803712 // =0xffc000000000 ; CHECK-NEXT: movk x8, #16607, lsl #48 -; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, z4.d -; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, z4.d -; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, z4.d -; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, z4.d -; CHECK-NEXT: movprfx z5, z3 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z3.d -; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: movprfx z6, z2 ; CHECK-NEXT: fcvtzs z6.d, p0/m, z2.d ; CHECK-NEXT: movprfx z7, z1 ; CHECK-NEXT: fcvtzs z7.d, p0/m, z1.d +; CHECK-NEXT: mov z5.d, x8 ; CHECK-NEXT: movprfx z24, z0 ; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.d +; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, z4.d +; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, z4.d +; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, z4.d +; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, z4.d +; CHECK-NEXT: movprfx z4, z3 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z3.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z5.d +; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z5.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z1.d, z5.d ; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z4.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z4.d ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p7.d, p0/z, z1.d, z4.d ; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: mov z4.d, p1/m, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z5.d ; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z5.d, p1/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z4.d ; CHECK-NEXT: mov z6.d, p2/m, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: fcmuo p2.d, p0/z, z3.d, z3.d ; CHECK-NEXT: mov z7.d, p3/m, #-32768 // =0xffffffffffff8000 @@ -467,7 +467,7 @@ define @test_signed_v8f64_v8i16( %f) { ; CHECK-NEXT: mov z24.d, p4/m, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: fcmuo p4.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z2.d, p5, z25.d, z5.d +; CHECK-NEXT: sel z2.d, p5, z25.d, z4.d ; CHECK-NEXT: sel z0.d, p6, z25.d, z6.d ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: sel z1.d, p7, z25.d, z7.d @@ -492,8 +492,8 @@ define @test_signed_v8f64_v8i16( %f) { define @test_signed_v2f64_v2i64( %f) { ; CHECK-LABEL: test_signed_v2f64_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff @@ -521,29 +521,29 @@ define @test_signed_v4f64_v4i64( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff -; CHECK-NEXT: mov z4.d, x8 -; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z2.d -; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z2.d -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.d +; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d ; CHECK-NEXT: movprfx z5, z1 ; CHECK-NEXT: fcvtzs z5.d, p0/m, z1.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z4.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z4.d +; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z2.d +; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z2.d +; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z3.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z3.d ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, p1/m, z3.d +; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d ; CHECK-NEXT: fcmuo p1.d, p0/z, z0.d, z0.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z3.d, p2, z3.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z2.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z3.d +; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d +; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d +; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -568,8 +568,8 @@ declare @llvm.fptosi.sat.nxv4f16.nxv4i64() define @test_signed_v2f16_v2i32( %f) { ; CHECK-LABEL: test_signed_v2f16_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff @@ -592,8 +592,8 @@ define @test_signed_v2f16_v2i32( %f) { define @test_signed_v4f16_v4i32( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #0x80000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff @@ -621,31 +621,31 @@ define @test_signed_v8f16_v8i32( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: uunpklo z1.s, z0.h ; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.s, #0x80000000 -; CHECK-NEXT: mov z4.h, w8 +; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: mov z6.s, #0x7fffffff ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h ; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.s, p0/m, z1.h +; CHECK-NEXT: mov z2.s, #0x80000000 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.s, p0/m, z1.h ; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: fcvtzs z5.s, p0/m, z0.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z4.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z4.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z3.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z3.h ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.s, p1/m, z3.s +; CHECK-NEXT: sel z3.s, p1, z2.s, z4.s ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z1.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z3.s, p2, z3.s, z5.s -; CHECK-NEXT: sel z0.s, p3, z6.s, z2.s -; CHECK-NEXT: sel z1.s, p4, z6.s, z3.s +; CHECK-NEXT: sel z2.s, p2, z2.s, z5.s +; CHECK-NEXT: sel z0.s, p3, z6.s, z3.s +; CHECK-NEXT: sel z1.s, p4, z6.s, z2.s ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.s, p0/m, #0 // =0x0 @@ -659,8 +659,8 @@ define @test_signed_v8f16_v8i32( %f) { define @test_signed_v4f16_v4i16( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #63488 // =0xf800 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #30719 // =0x77ff ; CHECK-NEXT: mov z2.h, w8 @@ -682,8 +682,8 @@ define @test_signed_v4f16_v4i16( %f) { define @test_signed_v8f16_v8i16( %f) { ; CHECK-LABEL: test_signed_v8f16_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #63488 // =0xf800 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #30719 // =0x77ff ; CHECK-NEXT: mov z2.h, w8 @@ -705,8 +705,8 @@ define @test_signed_v8f16_v8i16( %f) { define @test_signed_v2f16_v2i64( %f) { ; CHECK-LABEL: test_signed_v2f16_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff @@ -734,31 +734,31 @@ define @test_signed_v4f16_v4i64( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.d, #0x8000000000000000 -; CHECK-NEXT: mov z4.h, w8 +; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h ; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z1.h +; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.h ; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z4.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z4.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z3.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z3.h ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, p1/m, z3.d +; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z1.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z3.d, p2, z3.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z2.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z3.d +; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d +; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d +; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 diff --git a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll index c56c0b37888dc..ed352ffec339f 100644 --- a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll @@ -82,8 +82,8 @@ define @test_signed_v4f32_v4i16( %f) { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #65280 // =0xff00 ; CHECK-NEXT: movk w8, #18303, lsl #16 -; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: fcvtzu z2.s, p0/m, z0.s ; CHECK-NEXT: not p1.b, p0/z, p1.b @@ -102,9 +102,9 @@ define @test_signed_v8f32_v8i16( %f) { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #65280 // =0xff00 ; CHECK-NEXT: movk w8, #18303, lsl #16 -; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: fcvtzu z3.s, p0/m, z1.s ; CHECK-NEXT: movprfx z4, z0 @@ -146,10 +146,10 @@ define @test_signed_v2f32_v2i64( %f) { define @test_signed_v4f32_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: uunpkhi z3.d, z0.s ; CHECK-NEXT: mov w8, #1602224127 // =0x5f7fffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z2.s, #0.0 ; CHECK-NEXT: fcmge p2.s, p0/z, z3.s, #0.0 @@ -186,8 +186,8 @@ define @test_signed_v2f64_v2i32( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 ; CHECK-NEXT: movk x8, #16879, lsl #48 -; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: fcvtzu z2.d, p0/m, z0.d ; CHECK-NEXT: not p1.b, p0/z, p1.b @@ -206,9 +206,9 @@ define @test_signed_v4f64_v4i32( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 ; CHECK-NEXT: movk x8, #16879, lsl #48 -; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: fcvtzu z3.d, p0/m, z1.d ; CHECK-NEXT: movprfx z4, z0 @@ -241,28 +241,28 @@ define @test_signed_v8f64_v8i32( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 ; CHECK-NEXT: movk x8, #16879, lsl #48 -; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 ; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, #0.0 ; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, #0.0 ; CHECK-NEXT: movprfx z5, z1 ; CHECK-NEXT: fcvtzu z5.d, p0/m, z1.d +; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: movprfx z6, z0 ; CHECK-NEXT: fcvtzu z6.d, p0/m, z0.d ; CHECK-NEXT: movprfx z7, z3 ; CHECK-NEXT: fcvtzu z7.d, p0/m, z3.d ; CHECK-NEXT: movprfx z24, z2 ; CHECK-NEXT: fcvtzu z24.d, p0/m, z2.d +; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z4.d ; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z4.d -; CHECK-NEXT: mov z0.d, #0xffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: mov z0.d, #0xffffffff ; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p4.b, p0/z, p4.b ; CHECK-NEXT: mov z5.d, p1/m, #0 // =0x0 ; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z4.d +; CHECK-NEXT: not p4.b, p0/z, p4.b ; CHECK-NEXT: fcmgt p0.d, p0/z, z2.d, z4.d ; CHECK-NEXT: mov z6.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0 @@ -289,9 +289,9 @@ define @test_signed_v4f64_v4i16( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281337537757184 // =0xffe000000000 ; CHECK-NEXT: movk x8, #16623, lsl #48 -; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: fcvtzu z3.d, p0/m, z1.d ; CHECK-NEXT: movprfx z4, z0 @@ -324,28 +324,28 @@ define @test_signed_v8f64_v8i16( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281337537757184 // =0xffe000000000 ; CHECK-NEXT: movk x8, #16623, lsl #48 -; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, #0.0 ; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, #0.0 ; CHECK-NEXT: movprfx z5, z3 ; CHECK-NEXT: fcvtzu z5.d, p0/m, z3.d +; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: movprfx z6, z2 ; CHECK-NEXT: fcvtzu z6.d, p0/m, z2.d ; CHECK-NEXT: movprfx z7, z1 ; CHECK-NEXT: fcvtzu z7.d, p0/m, z1.d ; CHECK-NEXT: movprfx z24, z0 ; CHECK-NEXT: fcvtzu z24.d, p0/m, z0.d +; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z4.d ; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z4.d -; CHECK-NEXT: mov z2.d, #65535 // =0xffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: mov z2.d, #65535 // =0xffff ; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p4.b, p0/z, p4.b ; CHECK-NEXT: mov z5.d, p1/m, #0 // =0x0 ; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z4.d +; CHECK-NEXT: not p4.b, p0/z, p4.b ; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d ; CHECK-NEXT: mov z6.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0 @@ -465,10 +465,10 @@ define @test_signed_v4f16_v4i32( %f) { define @test_signed_v8f16_v8i32( %f) { ; CHECK-LABEL: test_signed_v8f16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: uunpkhi z3.s, z0.h ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z2.h, #0.0 ; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, #0.0 @@ -549,10 +549,10 @@ define @test_signed_v2f16_v2i64( %f) { define @test_signed_v4f16_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: uunpkhi z3.d, z0.s ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z2.h, #0.0 ; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, #0.0 diff --git a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll index 5c4c9463528b8..ad6371f78ec08 100644 --- a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll +++ b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll @@ -71,12 +71,12 @@ define @gather_i8_index_offset_8(ptr %base, i64 %offset, %pg, %data) #0 { ; CHECK-LABEL: scatter_f16_index_offset_var: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: mov z2.d, x1 -; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: uunpklo z3.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: punpklo p2.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: movprfx z4, z2 ; CHECK-NEXT: mla z4.d, p1/m, z1.d, z2.d @@ -101,16 +101,16 @@ define void @scatter_f16_index_offset_var(ptr %base, i64 %offset, i64 %scale, %pg, %data) #0 { ; CHECK-LABEL: scatter_i8_index_offset_maximum_plus_one: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: mov w8, #33554432 // =0x2000000 ; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: index z1.d, #0, x8 ; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: index z1.d, #0, x8 +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: lsr x9, x9, #4 +; CHECK-NEXT: add x8, x0, x1 ; CHECK-NEXT: mov w10, #67108864 // =0x4000000 -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: st1b { z2.d }, p1, [x8, z1.d] ; CHECK-NEXT: madd x8, x9, x10, x8 ; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d] @@ -131,17 +131,17 @@ define void @scatter_i8_index_offset_maximum_plus_one(ptr %base, i64 %offset, %pg, %data) #0 { ; CHECK-LABEL: scatter_i8_index_offset_minimum_minus_one: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: mov x8, #-33554433 // =0xfffffffffdffffff ; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: index z1.d, #0, x8 ; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: mov x10, #-2 // =0xfffffffffffffffe +; CHECK-NEXT: index z1.d, #0, x8 +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: lsr x9, x9, #4 +; CHECK-NEXT: mov x10, #-2 // =0xfffffffffffffffe ; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: movk x10, #64511, lsl #16 ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: st1b { z2.d }, p1, [x8, z1.d] ; CHECK-NEXT: madd x8, x9, x10, x8 ; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d] @@ -162,16 +162,16 @@ define void @scatter_i8_index_offset_minimum_minus_one(ptr %base, i64 %offset, < define void @scatter_i8_index_stride_too_big(ptr %base, i64 %offset, %pg, %data) #0 { ; CHECK-LABEL: scatter_i8_index_stride_too_big: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: mov x8, #4611686018427387904 // =0x4000000000000000 ; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: index z1.d, #0, x8 ; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: index z1.d, #0, x8 +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: lsr x9, x9, #4 +; CHECK-NEXT: add x8, x0, x1 ; CHECK-NEXT: mov x10, #-9223372036854775808 // =0x8000000000000000 -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: st1b { z2.d }, p1, [x8, z1.d] ; CHECK-NEXT: madd x8, x9, x10, x8 ; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d] diff --git a/llvm/test/CodeGen/AArch64/sve-hadd.ll b/llvm/test/CodeGen/AArch64/sve-hadd.ll index c73370d50287b..f90aef8daa5dc 100644 --- a/llvm/test/CodeGen/AArch64/sve-hadd.ll +++ b/llvm/test/CodeGen/AArch64/sve-hadd.ll @@ -129,9 +129,9 @@ define @haddu_v2i32( %s0, @haddu_v2i16( %s0, @haddu_v4i16( %s0, @haddu_v4i8( %s0, %s ; ; SVE2-LABEL: haddu_v4i8: ; SVE2: // %bb.0: // %entry -; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: and z0.s, z0.s, #0xff ; SVE2-NEXT: and z1.s, z1.s, #0xff +; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: uhadd z0.s, p0/m, z0.s, z1.s ; SVE2-NEXT: ret entry: @@ -557,9 +557,9 @@ define @haddu_v8i8( %s0, %s ; ; SVE2-LABEL: haddu_v8i8: ; SVE2: // %bb.0: // %entry -; SVE2-NEXT: ptrue p0.h ; SVE2-NEXT: and z0.h, z0.h, #0xff ; SVE2-NEXT: and z1.h, z1.h, #0xff +; SVE2-NEXT: ptrue p0.h ; SVE2-NEXT: uhadd z0.h, p0/m, z0.h, z1.h ; SVE2-NEXT: ret entry: @@ -787,9 +787,9 @@ define @rhaddu_v2i32( %s0, @rhaddu_v2i16( %s0, @rhaddu_v4i16( %s0, @rhaddu_v4i8( %s0, % ; ; SVE2-LABEL: rhaddu_v4i8: ; SVE2: // %bb.0: // %entry -; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: and z0.s, z0.s, #0xff ; SVE2-NEXT: and z1.s, z1.s, #0xff +; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: urhadd z0.s, p0/m, z0.s, z1.s ; SVE2-NEXT: ret entry: @@ -1241,9 +1241,9 @@ define @rhaddu_v8i8( %s0, % ; ; SVE2-LABEL: rhaddu_v8i8: ; SVE2: // %bb.0: // %entry -; SVE2-NEXT: ptrue p0.h ; SVE2-NEXT: and z0.h, z0.h, #0xff ; SVE2-NEXT: and z1.h, z1.h, #0xff +; SVE2-NEXT: ptrue p0.h ; SVE2-NEXT: urhadd z0.h, p0/m, z0.h, z1.h ; SVE2-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll b/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll index e20399de70bf8..73bbee094827e 100644 --- a/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll +++ b/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll @@ -175,14 +175,14 @@ define @uminv_zero_fill( %pg, @zero_fill_non_zero_index( %pg, %a) #0 { ; CHECK-LABEL: zero_fill_non_zero_index: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: uminv d3, p0, z0.d ; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: fmov x8, d3 ; CHECK-NEXT: cmpeq p0.d, p1/z, z1.d, z2.d +; CHECK-NEXT: fmov x8, d3 ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret %t1 = call i64 @llvm.aarch64.sve.uminv.nxv2i64( %pg, %a) @@ -210,11 +210,11 @@ define @zero_fill_type_mismatch( %pg, @zero_fill_no_zero_upper_lanes( %pg, %a) #0 { ; CHECK-LABEL: zero_fill_no_zero_upper_lanes: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d, vl1 ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z0.d ; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: mov z1.d, p1/m, x8 +; CHECK-NEXT: mov z1.d, p0/m, x8 ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret %t1 = call @llvm.aarch64.sve.umin.nxv2i64( %pg, %a, %a) diff --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll index 2aa298f6d9173..7344964f13bba 100644 --- a/llvm/test/CodeGen/AArch64/sve-insert-element.ll +++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll @@ -48,8 +48,8 @@ define @test_lane0_2xi64( %a) { define @test_lane0_2xf64( %a) { ; CHECK-LABEL: test_lane0_2xf64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: fmov d1, #1.00000000 +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, z1.d ; CHECK-NEXT: ret %b = insertelement %a, double 1.0, i32 0 @@ -59,8 +59,8 @@ define @test_lane0_2xf64( %a) { define @test_lane0_4xf32( %a) { ; CHECK-LABEL: test_lane0_4xf32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: fmov s1, #1.00000000 +; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret %b = insertelement %a, float 1.0, i32 0 @@ -70,8 +70,8 @@ define @test_lane0_4xf32( %a) { define @test_lane0_8xf16( %a) { ; CHECK-LABEL: test_lane0_8xf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: fmov h1, #1.00000000 +; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret %b = insertelement %a, half 1.0, i32 0 @@ -93,9 +93,9 @@ define @test_lane0_8xbf16( %a, bfloat define @test_lane4_2xi64( %a) { ; CHECK-LABEL: test_lane4_2xi64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #4 // =0x4 ; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov w8, #30 // =0x1e ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d @@ -109,9 +109,9 @@ define @test_lane4_2xi64( %a) { define @test_lane9_8xf16( %a) { ; CHECK-LABEL: test_lane9_8xf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #9 // =0x9 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h ; CHECK-NEXT: fmov h1, #1.00000000 @@ -124,9 +124,9 @@ define @test_lane9_8xf16( %a) { define @test_lane9_8xbf16( %a, bfloat %x) { ; CHECK-LABEL: test_lane9_8xbf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #9 // =0x9 ; CHECK-NEXT: index z2.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h ; CHECK-NEXT: mov z0.h, p0/m, h1 @@ -138,9 +138,9 @@ define @test_lane9_8xbf16( %a, bfloat define @test_lane1_16xi8( %a) { ; CHECK-LABEL: test_lane1_16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w8 ; CHECK-NEXT: mov w8, #30 // =0x1e ; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b @@ -153,9 +153,9 @@ define @test_lane1_16xi8( %a) { define @test_lanex_16xi8( %a, i32 %x) { ; CHECK-LABEL: test_lanex_16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w8 ; CHECK-NEXT: mov w8, #30 // =0x1e ; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b @@ -179,9 +179,9 @@ define @extract_insert_4xi32( %a) { define @test_lane6_undef_8xi16(i16 %a) { ; CHECK-LABEL: test_lane6_undef_8xi16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #6 // =0x6 ; CHECK-NEXT: index z0.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h ; CHECK-NEXT: mov z0.h, p0/m, w0 @@ -202,8 +202,8 @@ define @test_lane0_undef_16xi8(i8 %a) { define @test_insert0_of_extract0_16xi8( %a, %b) { ; CHECK-LABEL: test_insert0_of_extract0_16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl1 ; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: ptrue p0.b, vl1 ; CHECK-NEXT: mov z0.b, p0/m, w8 ; CHECK-NEXT: ret %c = extractelement %b, i32 0 @@ -215,12 +215,12 @@ define @test_insert64_of_extract64_16xi8( % ; CHECK-LABEL: test_insert64_of_extract64_16xi8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #64 // =0x40 -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: whilels p0.b, xzr, x8 ; CHECK-NEXT: mov z2.b, w8 ; CHECK-NEXT: lastb w9, p0, z1.b ; CHECK-NEXT: index z1.b, #0, #1 -; CHECK-NEXT: cmpeq p0.b, p1/z, z1.b, z2.b +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, w9 ; CHECK-NEXT: ret %c = extractelement %b, i32 64 @@ -231,9 +231,9 @@ define @test_insert64_of_extract64_16xi8( % define @test_insert3_of_extract1_16xi8( %a, %b) { ; CHECK-LABEL: test_insert3_of_extract1_16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z2.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z3.b, w8 ; CHECK-NEXT: umov w8, v1.b[1] ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b @@ -329,9 +329,9 @@ define @test_insert_into_undef_nxv2f64(double %a) { define @test_insert_with_index_nxv2f16(half %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: ret @@ -342,9 +342,9 @@ define @test_insert_with_index_nxv2f16(half %h, i64 %idx) { define @test_insert_with_index_nxv4f16(half %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: index z1.s, #0, #1 ; CHECK-NEXT: mov z2.s, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: ret @@ -355,9 +355,9 @@ define @test_insert_with_index_nxv4f16(half %h, i64 %idx) { define @test_insert_with_index_nxv8f16(half %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: index z1.h, #0, #1 ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: ret @@ -368,9 +368,9 @@ define @test_insert_with_index_nxv8f16(half %h, i64 %idx) { define @test_insert_with_index_nxv2bf16(bfloat %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv2bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: ret @@ -381,9 +381,9 @@ define @test_insert_with_index_nxv2bf16(bfloat %h, i64 %id define @test_insert_with_index_nxv4bf16(bfloat %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv4bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: index z1.s, #0, #1 ; CHECK-NEXT: mov z2.s, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: ret @@ -394,9 +394,9 @@ define @test_insert_with_index_nxv4bf16(bfloat %h, i64 %id define @test_insert_with_index_nxv8bf16(bfloat %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv8bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: index z1.h, #0, #1 ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: ret @@ -407,9 +407,9 @@ define @test_insert_with_index_nxv8bf16(bfloat %h, i64 %id define @test_insert_with_index_nxv2f32(float %f, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d ; CHECK-NEXT: mov z0.s, p0/m, s0 ; CHECK-NEXT: ret @@ -420,9 +420,9 @@ define @test_insert_with_index_nxv2f32(float %f, i64 %idx) define @test_insert_with_index_nxv4f32(float %f, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: index z1.s, #0, #1 ; CHECK-NEXT: mov z2.s, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, s0 ; CHECK-NEXT: ret @@ -433,9 +433,9 @@ define @test_insert_with_index_nxv4f32(float %f, i64 %idx) define @test_insert_with_index_nxv2f64(double %d, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d ; CHECK-NEXT: mov z0.d, p0/m, d0 ; CHECK-NEXT: ret @@ -447,11 +447,11 @@ define @test_insert_with_index_nxv2f64(double %d, i64 %idx define @test_predicate_insert_2xi1_immediate ( %val, i1 %elt) { ; CHECK-LABEL: test_predicate_insert_2xi1_immediate: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d, vl1 ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: mov z0.d, p0/m, x0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z0.d, p1/m, x0 ; CHECK-NEXT: and z0.d, z0.d, #0x1 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ret @@ -462,9 +462,9 @@ define @test_predicate_insert_2xi1_immediate ( @test_predicate_insert_4xi1_immediate ( %val, i1 %elt) { ; CHECK-LABEL: test_predicate_insert_4xi1_immediate: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov w8, #2 // =0x2 ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p2.s, p1/z, z0.s, z1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 @@ -479,9 +479,9 @@ define @test_predicate_insert_4xi1_immediate ( @test_predicate_insert_8xi1_immediate ( %val, i32 %idx) { ; CHECK-LABEL: test_predicate_insert_8xi1_immediate: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: index z0.h, #0, #1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: cmpeq p2.h, p1/z, z0.h, z1.h @@ -497,9 +497,9 @@ define @test_predicate_insert_8xi1_immediate ( @test_predicate_insert_16xi1_immediate ( %val) { ; CHECK-LABEL: test_predicate_insert_16xi1_immediate: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: mov w8, #4 // =0x4 ; CHECK-NEXT: index z0.b, #0, #1 +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: cmpeq p2.b, p1/z, z0.b, z1.b @@ -516,9 +516,9 @@ define @test_predicate_insert_16xi1_immediate ( @test_predicate_insert_2xi1( %val, i1 %elt, i32 %idx) { ; CHECK-LABEL: test_predicate_insert_2xi1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: mov w8, w1 ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: mov w8, w1 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: cmpeq p2.d, p1/z, z0.d, z1.d @@ -534,9 +534,9 @@ define @test_predicate_insert_2xi1( %val, i1 define @test_predicate_insert_4xi1( %val, i1 %elt, i32 %idx) { ; CHECK-LABEL: test_predicate_insert_4xi1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: mov w8, w1 ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: mov w8, w1 +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p2.s, p1/z, z0.s, z1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 @@ -550,9 +550,9 @@ define @test_predicate_insert_4xi1( %val, i1 define @test_predicate_insert_8xi1( %val, i1 %elt, i32 %idx) { ; CHECK-LABEL: test_predicate_insert_8xi1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: mov w8, w1 ; CHECK-NEXT: index z0.h, #0, #1 +; CHECK-NEXT: mov w8, w1 +; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: cmpeq p2.h, p1/z, z0.h, z1.h ; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 @@ -567,9 +567,9 @@ define @test_predicate_insert_8xi1( %val, i1 define @test_predicate_insert_16xi1( %val, i1 %elt, i32 %idx) { ; CHECK-LABEL: test_predicate_insert_16xi1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: mov w8, w1 ; CHECK-NEXT: index z0.b, #0, #1 +; CHECK-NEXT: mov w8, w1 +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: cmpeq p2.b, p1/z, z0.b, z1.b ; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 @@ -589,24 +589,24 @@ define @test_predicate_insert_32xi1( %val, ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG -; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: rdvl x8, #2 ; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: mov w9, w1 +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cmp x9, x8 ; CHECK-NEXT: csel x8, x9, x8, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1b { z0.b }, p2, [sp, #1, mul vl] -; CHECK-NEXT: st1b { z1.b }, p2, [sp] +; CHECK-NEXT: st1b { z0.b }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1b { z1.b }, p1, [sp] ; CHECK-NEXT: strb w0, [x9, x8] -; CHECK-NEXT: ld1b { z0.b }, p2/z, [sp] -; CHECK-NEXT: ld1b { z1.b }, p2/z, [sp, #1, mul vl] +; CHECK-NEXT: ld1b { z0.b }, p1/z, [sp] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [sp, #1, mul vl] ; CHECK-NEXT: and z0.b, z0.b, #0x1 ; CHECK-NEXT: and z1.b, z1.b, #0x1 -; CHECK-NEXT: cmpne p0.b, p2/z, z0.b, #0 -; CHECK-NEXT: cmpne p1.b, p2/z, z1.b, #0 +; CHECK-NEXT: cmpne p0.b, p1/z, z0.b, #0 +; CHECK-NEXT: cmpne p1.b, p1/z, z1.b, #0 ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll index 4a5e272582d8e..5efe9e2819d5e 100644 --- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll @@ -17,15 +17,15 @@ define @insert_v2i64_nxv2i64_idx2( %vec, <2 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x8 ; CHECK-NEXT: mov w9, #2 // =0x2 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: cmp x8, #2 +; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 @@ -51,15 +51,15 @@ define @insert_v4i32_nxv4i32_idx4( %vec, <4 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cntw x8 ; CHECK-NEXT: mov w9, #4 // =0x4 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sub x8, x8, #4 ; CHECK-NEXT: cmp x8, #4 +; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #2 -; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 @@ -85,15 +85,15 @@ define @insert_v8i16_nxv8i16_idx8( %vec, <8 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cnth x8 ; CHECK-NEXT: mov w9, #8 // =0x8 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: sub x8, x8, #8 ; CHECK-NEXT: cmp x8, #8 +; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #1 -; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 @@ -119,15 +119,15 @@ define @insert_v16i8_nxv16i8_idx16( %vec, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w9, #16 // =0x10 ; CHECK-NEXT: sub x8, x8, #16 -; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: cmp x8, #16 -; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: st1b { z0.b }, p0, [sp] -; CHECK-NEXT: str q1, [x10, x8] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -239,8 +239,8 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) uwtable { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: str q0, [sp, #16] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [sp] diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll index c0ddceb42e1d0..52bd79e7a7e60 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll @@ -55,8 +55,8 @@ define @smax_i16_neg( %a) { define @smax_i16_out_of_range( %a) { ; CHECK-LABEL: smax_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: dupm z1.b, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 257, i32 0 @@ -93,8 +93,8 @@ define @smax_i32_neg( %a) { define @smax_i32_out_of_range( %a) { ; CHECK-LABEL: smax_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #-129 // =0xffffffffffffff7f +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %elt = insertelement undef, i32 -129, i32 0 @@ -131,8 +131,8 @@ define @smax_i64_neg( %a) { define @smax_i64_out_of_range( %a) { ; CHECK-LABEL: smax_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 65535, i32 0 @@ -196,8 +196,8 @@ define @smin_i16_neg( %a) { define @smin_i16_out_of_range( %a) { ; CHECK-LABEL: smin_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: dupm z1.b, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 257, i32 0 @@ -234,8 +234,8 @@ define @smin_i32_neg( %a) { define @smin_i32_out_of_range( %a) { ; CHECK-LABEL: smin_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #-129 // =0xffffffffffffff7f +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %elt = insertelement undef, i32 -129, i32 0 @@ -272,8 +272,8 @@ define @smin_i64_neg( %a) { define @smin_i64_out_of_range( %a) { ; CHECK-LABEL: smin_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 65535, i32 0 @@ -325,8 +325,8 @@ define @umax_i16_pos( %a) { define @umax_i16_out_of_range( %a) { ; CHECK-LABEL: umax_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: dupm z1.b, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 257, i32 0 @@ -351,8 +351,8 @@ define @umax_i32_pos( %a) { define @umax_i32_out_of_range( %a) { ; CHECK-LABEL: umax_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #257 // =0x101 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -378,8 +378,8 @@ define @umax_i64_pos( %a) { define @umax_i64_out_of_range( %a) { ; CHECK-LABEL: umax_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 65535, i32 0 @@ -431,8 +431,8 @@ define @umin_i16_pos( %a) { define @umin_i16_out_of_range( %a) { ; CHECK-LABEL: umin_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: dupm z1.b, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 257, i32 0 @@ -457,8 +457,8 @@ define @umin_i32_pos( %a) { define @umin_i32_out_of_range( %a) { ; CHECK-LABEL: umin_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #257 // =0x101 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -484,8 +484,8 @@ define @umin_i64_pos( %a) { define @umin_i64_out_of_range( %a) { ; CHECK-LABEL: umin_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 65535, i32 0 @@ -589,8 +589,8 @@ define @mul_i64_pos( %a) { define @mul_i16_range( %a) { ; CHECK-LABEL: mul_i16_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, #255 // =0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 255, i32 0 @@ -602,8 +602,8 @@ define @mul_i16_range( %a) { define @mul_i32_range( %a) { ; CHECK-LABEL: mul_i32_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #255 // =0xff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %elt = insertelement undef, i32 255, i32 0 @@ -615,8 +615,8 @@ define @mul_i32_range( %a) { define @mul_i64_range( %a) { ; CHECK-LABEL: mul_i64_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #255 // =0xff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 255, i32 0 @@ -766,8 +766,8 @@ define @lsr_i64( %a){ define @sdiv_const( %a) #0 { ; CHECK-LABEL: sdiv_const: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #3 // =0x3 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: @@ -778,8 +778,8 @@ entry: define @udiv_const( %a) #0 { ; CHECK-LABEL: udiv_const: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #3 // =0x3 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-int-arith.ll index fc2672f8c80a8..5f92dee3b5305 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-arith.ll @@ -532,8 +532,8 @@ define @mls_i64( %a, %b, define @muladd_i64_positiveAddend( %a, %b) ; CHECK-LABEL: muladd_i64_positiveAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mad z0.d, p0/m, z1.d, z2.d ; CHECK-NEXT: ret { @@ -545,8 +545,8 @@ define @mls_i64( %a, %b, define @muladd_i64_negativeAddend( %a, %b) ; CHECK-LABEL: muladd_i64_negativeAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff00000001 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mad z0.d, p0/m, z1.d, z2.d ; CHECK-NEXT: ret { @@ -559,8 +559,8 @@ define @muladd_i64_negativeAddend( %a, @muladd_i32_positiveAddend( %a, %b) ; CHECK-LABEL: muladd_i32_positiveAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #0x10000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ret { @@ -572,8 +572,8 @@ define @muladd_i32_positiveAddend( %a, @muladd_i32_negativeAddend( %a, %b) ; CHECK-LABEL: muladd_i32_negativeAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #0xffff0000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ret { @@ -585,8 +585,8 @@ define @muladd_i32_negativeAddend( %a, @muladd_i16_positiveAddend( %a, %b) ; CHECK-LABEL: muladd_i16_positiveAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, #255 // =0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret { @@ -598,8 +598,8 @@ define @muladd_i16_positiveAddend( %a, @muladd_i16_negativeAddend( %a, %b) ; CHECK-LABEL: muladd_i16_negativeAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, #-255 // =0xffffffffffffff01 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret { @@ -611,8 +611,8 @@ define @muladd_i16_negativeAddend( %a, @muladd_i8_positiveAddend( %a, %b) ; CHECK-LABEL: muladd_i8_positiveAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, #15 // =0xf +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mad z0.b, p0/m, z1.b, z2.b ; CHECK-NEXT: ret { @@ -624,8 +624,8 @@ define @muladd_i8_positiveAddend( %a, @muladd_i8_negativeAddend( %a, %b) ; CHECK-LABEL: muladd_i8_negativeAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, #-15 // =0xfffffffffffffff1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mad z0.b, p0/m, z1.b, z2.b ; CHECK-NEXT: ret { @@ -748,8 +748,8 @@ define @mulsub_i8_negativeAddend( %a, @multiple_fused_ops( %a, %b) ; CHECK-LABEL: multiple_fused_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #200 // =0xc8 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: mla z2.h, p0/m, z0.h, z1.h ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h @@ -770,8 +770,8 @@ define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) { ; CHECK-NEXT: b.lt .LBB70_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: mov w9, w3 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: whilelo p1.s, xzr, x9 ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: cntw x10 diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll index d04da62451778..8c1b5225b7f25 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll @@ -378,29 +378,29 @@ declare i8 @llvm.vector.reduce.smin.nxv10i8() define i8 @smin_nxv10i8( %a) { ; CHECK-LABEL: smin_nxv10i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpkhi z2.h, z0.b -; CHECK-NEXT: mov z1.d, #127 // =0x7f +; CHECK-NEXT: uunpkhi z1.h, z0.b +; CHECK-NEXT: mov z3.d, #127 // =0x7f ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: uunpklo z3.s, z2.h -; CHECK-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEXT: uunpklo z3.d, z3.s -; CHECK-NEXT: uzp1 z3.s, z3.s, z1.s -; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h -; CHECK-NEXT: uzp1 z2.b, z0.b, z2.b -; CHECK-NEXT: uunpkhi z2.h, z2.b -; CHECK-NEXT: uunpkhi z3.s, z2.h -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEXT: uzp1 z3.s, z1.s, z3.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z3.h -; CHECK-NEXT: uzp1 z2.b, z0.b, z2.b -; CHECK-NEXT: uunpkhi z2.h, z2.b -; CHECK-NEXT: uunpkhi z3.s, z2.h -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z3.d, z3.s -; CHECK-NEXT: uzp1 z1.s, z3.s, z1.s +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s ; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h +; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b +; CHECK-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: uzp1 z2.s, z3.s, z2.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b +; CHECK-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b ; CHECK-NEXT: sminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -414,12 +414,12 @@ declare i8 @llvm.vector.reduce.add.nxv12i8() define i8 @uaddv_nxv12i8( %a) { ; CHECK-LABEL: uaddv_nxv12i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpkhi z2.h, z0.b -; CHECK-NEXT: mov z1.s, #0 // =0x0 +; CHECK-NEXT: uunpkhi z1.h, z0.b +; CHECK-NEXT: mov z2.s, #0 // =0x0 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b ; CHECK-NEXT: uaddv d0, p0, z0.b ; CHECK-NEXT: fmov x0, d0 @@ -434,15 +434,15 @@ declare i8 @llvm.vector.reduce.umax.nxv14i8() define i8 @umax_nxv14i8( %a) { ; CHECK-LABEL: umax_nxv14i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpkhi z2.h, z0.b -; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: uunpkhi z1.h, z0.b +; CHECK-NEXT: mov z3.d, #0 // =0x0 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: uunpkhi z3.s, z2.h -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z3.d, z3.s -; CHECK-NEXT: uzp1 z1.s, z3.s, z1.s -; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b ; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll index 2464eacd185dd..bc94c087ef5fe 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll @@ -239,10 +239,10 @@ define @index_rr_i32_combine(i32 %a, i32 %b) { define @index_rr_i32_not_combine(i32 %a, i32 %b) { ; CHECK-LABEL: index_rr_i32_not_combine: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: index z0.s, #0, #1 ; CHECK-NEXT: mov z1.s, w0 ; CHECK-NEXT: mov z2.s, w1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mla z1.s, p0/m, z0.s, z2.s ; CHECK-NEXT: add z0.s, z1.s, z0.s ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll index 3e453a6b78179..5648e8244e6ec 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll @@ -247,8 +247,8 @@ define @sub_i32_ptrue_all_h( %a) #0 { define @sub_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: sub_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sub z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -402,8 +402,8 @@ define @subr_i32_ptrue_all_h( %a) #0 { define @subr_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: subr_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: subr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -449,8 +449,8 @@ define @smax_i16( %a) { define @smax_i16_out_of_range( %a) { ; CHECK-LABEL: smax_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #129 // =0x81 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -480,8 +480,8 @@ define @smax_i32( %a) { define @smax_i32_out_of_range( %a) { ; CHECK-LABEL: smax_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #-129 // =0xffffffffffffff7f +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) @@ -510,8 +510,8 @@ define @smax_i64( %a) { define @smax_i64_out_of_range( %a) { ; CHECK-LABEL: smax_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -559,8 +559,8 @@ define @smax_i32_ptrue_all_h( %a) #0 { define @smax_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: smax_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -606,8 +606,8 @@ define @smin_i16( %a) { define @smin_i16_out_of_range( %a) { ; CHECK-LABEL: smin_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, #-129 // =0xffffffffffffff7f +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) @@ -636,8 +636,8 @@ define @smin_i32( %a) { define @smin_i32_out_of_range( %a) { ; CHECK-LABEL: smin_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #257 // =0x101 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -668,8 +668,8 @@ define @smin_i64( %a) { define @smin_i64_out_of_range( %a) { ; CHECK-LABEL: smin_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #-256 // =0xffffffffffffff00 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -717,8 +717,8 @@ define @smin_i32_ptrue_all_h( %a) #0 { define @smin_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: smin_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -764,8 +764,8 @@ define @umax_i16( %a) { define @umax_i16_out_of_range( %a) { ; CHECK-LABEL: umax_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: dupm z1.b, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) @@ -794,8 +794,8 @@ define @umax_i32( %a) { define @umax_i32_out_of_range( %a) { ; CHECK-LABEL: umax_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #257 // =0x101 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -825,8 +825,8 @@ define @umax_i64( %a) { define @umax_i64_out_of_range( %a) { ; CHECK-LABEL: umax_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -874,8 +874,8 @@ define @umax_i32_ptrue_all_h( %a) #0 { define @umax_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: umax_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -921,8 +921,8 @@ define @umin_i16( %a) { define @umin_i16_out_of_range( %a) { ; CHECK-LABEL: umin_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: dupm z1.b, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) @@ -951,8 +951,8 @@ define @umin_i32( %a) { define @umin_i32_out_of_range( %a) { ; CHECK-LABEL: umin_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #257 // =0x101 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -982,8 +982,8 @@ define @umin_i64( %a) { define @umin_i64_out_of_range( %a) { ; CHECK-LABEL: umin_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -1031,8 +1031,8 @@ define @umin_i32_ptrue_all_h( %a) #0 { define @umin_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: umin_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -2120,8 +2120,8 @@ define @mul_i32_ptrue_all_h( %a) #0 { define @mul_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: mul_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll index 7cdedeee2cada..5db7ee75c2a8d 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll @@ -261,8 +261,8 @@ define @orr_i32_ptrue_all_h( %a) { define @orr_i32_ptrue_all_d( %a) { ; CHECK-LABEL: orr_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: orr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) diff --git a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll index da5dc5c5b34d9..619134dc4a696 100644 --- a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll @@ -38,18 +38,18 @@ define @test_post_ld1_dup(ptr %a, ptr %ptr, i64 %inc) { define void @test_post_ld1_int_fixed(ptr %data, i64 %idx, ptr %addr, ptr %res_ptr) #1 { ; CHECK-LABEL: test_post_ld1_int_fixed: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #2 // =0x2 ; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: ptrue p1.d, vl1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: ldr x9, [x0, x1, lsl #3] +; CHECK-NEXT: ptrue p2.d, vl1 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2] -; CHECK-NEXT: cmpeq p2.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ldr x9, [x0, x1, lsl #3] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d ; CHECK-NEXT: mov z0.d, z2.d -; CHECK-NEXT: mov z2.d, p2/m, x9 -; CHECK-NEXT: mov z0.d, p1/m, x8 +; CHECK-NEXT: mov z0.d, p2/m, x8 +; CHECK-NEXT: mov z2.d, p1/m, x9 ; CHECK-NEXT: add z0.d, z0.d, z2.d ; CHECK-NEXT: st1d { z0.d }, p0, [x3] ; CHECK-NEXT: ret @@ -67,18 +67,18 @@ define void @test_post_ld1_int_fixed(ptr %data, i64 %idx, ptr %addr, ptr %res_pt define void @test_post_ld1_double_fixed(ptr %data, i64 %idx, ptr %addr, ptr %res_ptr) #1 { ; CHECK-LABEL: test_post_ld1_double_fixed: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #2 // =0x2 ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ptrue p1.d, vl1 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2] -; CHECK-NEXT: cmpeq p2.d, p0/z, z0.d, z1.d -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x0, x1, lsl #3] -; CHECK-NEXT: sel z0.d, p1, z0.d, z2.d -; CHECK-NEXT: mov z2.d, p2/m, d1 -; CHECK-NEXT: fadd z0.d, z0.d, z2.d +; CHECK-NEXT: ptrue p2.d, vl1 +; CHECK-NEXT: ldr d2, [x0, x1, lsl #3] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x2] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: sel z1.d, p2, z1.d, z0.d +; CHECK-NEXT: mov z0.d, p1/m, d2 +; CHECK-NEXT: fadd z0.d, z1.d, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x3] ; CHECK-NEXT: ret %A = load <4 x double>, ptr %addr diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll index e42e2272a2d4f..fbe82e8591fd0 100644 --- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll @@ -20,8 +20,8 @@ define @ld1r_stack() { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: adrp x8, :got:g8 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr x8, [x8, :got_lo12:g8] ; CHECK-NEXT: ldrb w8, [x8] ; CHECK-NEXT: strb w8, [sp, #12] @@ -1433,10 +1433,10 @@ define ptr @avoid_preindex_load(ptr %src, ptr %out) { define ptr @avoid_preindex_load_dup(ptr %src, %pg, ptr %out) { ; CHECK-LABEL: avoid_preindex_load_dup: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x0, x0, #1 -; CHECK-NEXT: st1d { z0.d }, p1, [x1] +; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %ptr = getelementptr inbounds i8, ptr %src, i64 1 %tmp = load i8, ptr %ptr, align 4 @@ -1450,10 +1450,10 @@ define ptr @avoid_preindex_load_dup(ptr %src, %pg, ptr %out) { define ptr @avoid_preindex_load_dup_passthru_zero(ptr %src, %pg, ptr %out) { ; CHECK-LABEL: avoid_preindex_load_dup_passthru_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x0, x0, #1 -; CHECK-NEXT: st1d { z0.d }, p1, [x1] +; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %ptr = getelementptr inbounds i8, ptr %src, i64 1 %tmp = load i8, ptr %ptr, align 4 @@ -1467,10 +1467,10 @@ define ptr @avoid_preindex_load_dup_passthru_zero(ptr %src, %p define ptr @preindex_load_dup_passthru( %passthru, ptr %src, %pg, ptr %out) { ; CHECK-LABEL: preindex_load_dup_passthru: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ldrsb x8, [x0, #1]! ; CHECK-NEXT: mov z0.d, p0/m, x8 -; CHECK-NEXT: st1d { z0.d }, p1, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %ptr = getelementptr inbounds i8, ptr %src, i64 1 %tmp = load i8, ptr %ptr, align 4 @@ -1485,8 +1485,8 @@ define ptr @preindex_load_dup_passthru( %passthru, ptr %src, < define ptr @preidx8sext64_instead_of_ld1r(ptr %src, ptr %out, ptr %dst) { ; CHECK-LABEL: preidx8sext64_instead_of_ld1r: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldrsb x8, [x0, #1]! +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: str x8, [x2] diff --git a/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll index 06ec132808154..5d53c00c52728 100644 --- a/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll +++ b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll @@ -38,8 +38,8 @@ define void @ld_st_nxv8i16(ptr %in, ptr %out) { ; ; ASM-LABEL: ld_st_nxv8i16: ; ASM: // %bb.0: // %entry -; ASM-NEXT: ptrue p0.h ; ASM-NEXT: mov z0.h, #3 // =0x3 +; ASM-NEXT: ptrue p0.h ; ASM-NEXT: mov x8, xzr ; ASM-NEXT: cnth x9 ; ASM-NEXT: .LBB0_1: // %loop @@ -111,8 +111,8 @@ define void @masked_ld_st_nxv8i16(ptr %in, ptr %out, i64 %n) { ; ; ASM-LABEL: masked_ld_st_nxv8i16: ; ASM: // %bb.0: // %entry -; ASM-NEXT: ptrue p0.h ; ASM-NEXT: mov z0.h, #3 // =0x3 +; ASM-NEXT: ptrue p0.h ; ASM-NEXT: mov x8, xzr ; ASM-NEXT: cnth x9 ; ASM-NEXT: .LBB1_1: // %loop diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll index e40d65efb158b..dfdfc456ccdba 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll @@ -126,9 +126,9 @@ define @masked_gather_nxv8f16( %ptrs, @masked_gather_nxv8bf16(ptr %base, %indices, %mask) #0 { ; CHECK-LABEL: masked_gather_nxv8bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: sunpkhi z1.s, z0.h ; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, z1.s, sxtw #1] ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1] @@ -175,14 +175,14 @@ define @masked_gather_nxv8f32(ptr %base, define @masked_gather_nxv16i8(ptr %base, %indices, %mask) #0 { ; CHECK-LABEL: masked_gather_nxv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: sunpkhi z1.h, z0.b +; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: sunpkhi z2.s, z1.h -; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: punpkhi p2.h, p1.b ; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: sunpkhi z2.s, z1.h +; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: ld1b { z2.s }, p2/z, [x0, z2.s, sxtw] ; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, z1.s, sxtw] ; CHECK-NEXT: punpkhi p1.h, p0.b diff --git a/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll b/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll index 40d889f1b501e..d397424cb162f 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll @@ -235,9 +235,9 @@ define @masked_sload_x2_8i8_8i64(ptr %a, ptr %b, @masked_zload_nxv8i16(ptr %a, %mask) define @masked_zload_2i16_2f64(ptr noalias %in, %mask) { ; CHECK-LABEL: masked_zload_2i16_2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] -; CHECK-NEXT: ucvtf z0.d, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: ret %wide.load = call @llvm.masked.load.nxv2i16(ptr %in, i32 2, %mask, undef) %zext = zext %wide.load to @@ -230,9 +230,9 @@ define @masked_zload_x2_8i8_8i64(ptr %a, ptr %b, %data, ptr %base, %offsets, %mask) #0 { ; CHECK-LABEL: masked_scatter_nxv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: sunpklo z2.h, z1.b ; CHECK-NEXT: uunpklo z4.h, z0.b -; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: sunpkhi z1.h, z1.b ; CHECK-NEXT: uunpkhi z0.h, z0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p2.h, p1.b +; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: sunpklo z3.s, z2.h ; CHECK-NEXT: uunpklo z5.s, z4.h ; CHECK-NEXT: sunpkhi z2.s, z2.h -; CHECK-NEXT: punpklo p2.h, p1.b -; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: st1b { z5.s }, p2, [x0, z3.s, sxtw] ; CHECK-NEXT: uunpkhi z3.s, z4.h ; CHECK-NEXT: st1b { z3.s }, p1, [x0, z2.s, sxtw] -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: sunpklo z2.s, z1.h -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: uunpklo z3.s, z0.h ; CHECK-NEXT: sunpkhi z1.s, z1.h ; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: punpklo p1.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: st1b { z3.s }, p1, [x0, z2.s, sxtw] ; CHECK-NEXT: st1b { z0.s }, p0, [x0, z1.s, sxtw] ; CHECK-NEXT: ret @@ -40,12 +40,12 @@ define void @masked_scatter_nxv16i8( %data, ptr %base, %data, ptr %base, %offsets, %mask) #0 { ; CHECK-LABEL: masked_scatter_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: sunpkhi z1.s, z1.h ; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: punpklo p1.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: st1h { z3.s }, p1, [x0, z2.s, sxtw #1] ; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1] ; CHECK-NEXT: ret @@ -57,12 +57,12 @@ define void @masked_scatter_nxv8i16( %data, ptr %base, %data, ptr %base, %offsets, %mask) #0 { ; CHECK-LABEL: masked_scatter_nxv8bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: sunpkhi z1.s, z1.h ; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: punpklo p1.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: st1h { z3.s }, p1, [x0, z2.s, sxtw #1] ; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll index e866474942cd7..94e525d22b825 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll @@ -76,8 +76,8 @@ define void @masked_scatter_nxv2f64( %data, %pg) { ; CHECK-LABEL: masked_scatter_splat_constant_pointer: ; CHECK: // %bb.0: // %vector.body -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: st1w { z0.d }, p1, [z0.d] ; CHECK-NEXT: st1w { z0.d }, p0, [z0.d] diff --git a/llvm/test/CodeGen/AArch64/sve-pr62151.ll b/llvm/test/CodeGen/AArch64/sve-pr62151.ll index 7cec20fda429c..5ed34f14a0b14 100644 --- a/llvm/test/CodeGen/AArch64/sve-pr62151.ll +++ b/llvm/test/CodeGen/AArch64/sve-pr62151.ll @@ -5,8 +5,8 @@ define i32 @build_interpolation(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2) { ; CHECK-LABEL: build_interpolation: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z2.s ; CHECK-NEXT: mla v0.2s, v1.2s, v0.s[1] diff --git a/llvm/test/CodeGen/AArch64/sve-pred-arith.ll b/llvm/test/CodeGen/AArch64/sve-pred-arith.ll index 4d46ac5ecbaa9..6e08606db9537 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-arith.ll @@ -54,24 +54,24 @@ define aarch64_sve_vector_pcs @add_nxv64i1( ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: ptrue p4.b ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue p6.b ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: ldr p5, [x0] -; CHECK-NEXT: ldr p6, [x1] +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ldr p4, [x0] +; CHECK-NEXT: ldr p5, [x1] ; CHECK-NEXT: ldr p7, [x2] ; CHECK-NEXT: ldr p8, [x3] -; CHECK-NEXT: eor p0.b, p4/z, p0.b, p5.b -; CHECK-NEXT: eor p1.b, p4/z, p1.b, p6.b -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: eor p2.b, p4/z, p2.b, p7.b +; CHECK-NEXT: eor p0.b, p6/z, p0.b, p4.b +; CHECK-NEXT: eor p1.b, p6/z, p1.b, p5.b +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: eor p2.b, p6/z, p2.b, p7.b ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: eor p3.b, p4/z, p3.b, p8.b +; CHECK-NEXT: eor p3.b, p6/z, p3.b, p8.b ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 @@ -138,24 +138,24 @@ define aarch64_sve_vector_pcs @sub_nxv64i1( ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: ptrue p4.b ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue p6.b ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: ldr p5, [x0] -; CHECK-NEXT: ldr p6, [x1] +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ldr p4, [x0] +; CHECK-NEXT: ldr p5, [x1] ; CHECK-NEXT: ldr p7, [x2] ; CHECK-NEXT: ldr p8, [x3] -; CHECK-NEXT: eor p0.b, p4/z, p0.b, p5.b -; CHECK-NEXT: eor p1.b, p4/z, p1.b, p6.b -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: eor p2.b, p4/z, p2.b, p7.b +; CHECK-NEXT: eor p0.b, p6/z, p0.b, p4.b +; CHECK-NEXT: eor p1.b, p6/z, p1.b, p5.b +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: eor p2.b, p6/z, p2.b, p7.b ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: eor p3.b, p4/z, p3.b, p8.b +; CHECK-NEXT: eor p3.b, p6/z, p3.b, p8.b ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll index 600e9c4805ff7..8438e9d88f5de 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll @@ -322,10 +322,10 @@ entry: define @ornot_v4i32( %z, %x, %y) { ; CHECK-LABEL: ornot_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z3.s, #-1 // =0xffffffffffffffff -; CHECK-NEXT: eor z2.d, z2.d, z3.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: eor z2.d, z2.d, z3.d ; CHECK-NEXT: orr z1.d, z1.d, z2.d ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret @@ -340,10 +340,10 @@ entry: define @ornot_v8i16( %z, %x, %y) { ; CHECK-LABEL: ornot_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z3.h, #-1 // =0xffffffffffffffff -; CHECK-NEXT: eor z2.d, z2.d, z3.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: eor z2.d, z2.d, z3.d ; CHECK-NEXT: orr z1.d, z1.d, z2.d ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret @@ -358,10 +358,10 @@ entry: define @ornot_v16i8( %z, %x, %y) { ; CHECK-LABEL: ornot_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z3.b, #-1 // =0xffffffffffffffff -; CHECK-NEXT: eor z2.d, z2.d, z3.d +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: eor z2.d, z2.d, z3.d ; CHECK-NEXT: orr z1.d, z1.d, z2.d ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret @@ -904,8 +904,8 @@ define @addqr_v4i32( %z, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: add z1.s, z1.s, z2.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: add z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -922,8 +922,8 @@ define @addqr_v8i16( %z, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: add z1.h, z1.h, z2.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: add z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -940,8 +940,8 @@ define @addqr_v16i8( %z, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: add z1.b, z1.b, z2.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: add z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: @@ -958,8 +958,8 @@ define @subqr_v4i32( %z, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: sub z1.s, z1.s, z2.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: sub z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -976,8 +976,8 @@ define @subqr_v8i16( %z, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: sub z1.h, z1.h, z2.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: sub z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -994,8 +994,8 @@ define @subqr_v16i8( %z, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: sub z1.b, z1.b, z2.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: sub z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: @@ -1010,10 +1010,10 @@ entry: define @mulqr_v4i32( %z, %x, i32 %y) { ; CHECK-LABEL: mulqr_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: mul z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: mul z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1028,10 +1028,10 @@ entry: define @mulqr_v8i16( %z, %x, i16 %y) { ; CHECK-LABEL: mulqr_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: mul z1.h, z1.h, z2.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: mul z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1046,10 +1046,10 @@ entry: define @mulqr_v16i8( %z, %x, i8 %y) { ; CHECK-LABEL: mulqr_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: mul z1.b, z1.b, z2.b +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: mul z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: @@ -1064,11 +1064,11 @@ entry: define @faddqr_v4f32( %z, %x, float %y) { ; CHECK-LABEL: faddqr_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $s2 killed $s2 def $z2 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, s2 -; CHECK-NEXT: fadd z1.s, z1.s, z2.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fadd z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1083,11 +1083,11 @@ entry: define @faddqr_v8f16( %z, %x, half %y) { ; CHECK-LABEL: faddqr_v8f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, h2 -; CHECK-NEXT: fadd z1.h, z1.h, z2.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: fadd z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1102,11 +1102,11 @@ entry: define @fsubqr_v4f32( %z, %x, float %y) { ; CHECK-LABEL: fsubqr_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $s2 killed $s2 def $z2 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, s2 -; CHECK-NEXT: fsub z1.s, z1.s, z2.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fsub z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1121,11 +1121,11 @@ entry: define @fsubqr_v8f16( %z, %x, half %y) { ; CHECK-LABEL: fsubqr_v8f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, h2 -; CHECK-NEXT: fsub z1.h, z1.h, z2.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: fsub z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1140,11 +1140,11 @@ entry: define @fmulqr_v4f32( %z, %x, float %y) { ; CHECK-LABEL: fmulqr_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $s2 killed $s2 def $z2 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, s2 -; CHECK-NEXT: fmul z1.s, z1.s, z2.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fmul z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1159,11 +1159,11 @@ entry: define @fmulqr_v8f16( %z, %x, half %y) { ; CHECK-LABEL: fmulqr_v8f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, h2 -; CHECK-NEXT: fmul z1.h, z1.h, z2.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: fmul z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1178,10 +1178,10 @@ entry: define @sadd_satqr_v4i32( %z, %x, i32 %y) { ; CHECK-LABEL: sadd_satqr_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: sqadd z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: sqadd z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1196,10 +1196,10 @@ entry: define @sadd_satqr_v8i16( %z, %x, i16 %y) { ; CHECK-LABEL: sadd_satqr_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: sqadd z1.h, z1.h, z2.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: sqadd z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1214,10 +1214,10 @@ entry: define @sadd_satqr_v16i8( %z, %x, i8 %y) { ; CHECK-LABEL: sadd_satqr_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: sqadd z1.b, z1.b, z2.b +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: sqadd z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: @@ -1232,10 +1232,10 @@ entry: define @uadd_satqr_v4i32( %z, %x, i32 %y) { ; CHECK-LABEL: uadd_satqr_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: uqadd z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: uqadd z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1250,10 +1250,10 @@ entry: define @uadd_satqr_v8i16( %z, %x, i16 %y) { ; CHECK-LABEL: uadd_satqr_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: uqadd z1.h, z1.h, z2.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: uqadd z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1268,10 +1268,10 @@ entry: define @uadd_satqr_v16i8( %z, %x, i8 %y) { ; CHECK-LABEL: uadd_satqr_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: uqadd z1.b, z1.b, z2.b +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: uqadd z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: @@ -1286,10 +1286,10 @@ entry: define @ssub_satqr_v4i32( %z, %x, i32 %y) { ; CHECK-LABEL: ssub_satqr_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: sqsub z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: sqsub z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1304,10 +1304,10 @@ entry: define @ssub_satqr_v8i16( %z, %x, i16 %y) { ; CHECK-LABEL: ssub_satqr_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: sqsub z1.h, z1.h, z2.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: sqsub z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1322,10 +1322,10 @@ entry: define @ssub_satqr_v16i8( %z, %x, i8 %y) { ; CHECK-LABEL: ssub_satqr_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: sqsub z1.b, z1.b, z2.b +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: sqsub z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: @@ -1340,10 +1340,10 @@ entry: define @usub_satqr_v4i32( %z, %x, i32 %y) { ; CHECK-LABEL: usub_satqr_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: uqsub z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: uqsub z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1358,10 +1358,10 @@ entry: define @usub_satqr_v8i16( %z, %x, i16 %y) { ; CHECK-LABEL: usub_satqr_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: uqsub z1.h, z1.h, z2.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: uqsub z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1376,10 +1376,10 @@ entry: define @usub_satqr_v16i8( %z, %x, i8 %y) { ; CHECK-LABEL: usub_satqr_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: uqsub z1.b, z1.b, z2.b +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: uqsub z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll index 14bc1b45e79ee..2541910e080e3 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll @@ -202,9 +202,9 @@ entry: define @sdiv_nxv8i16_x( %x, %y, %n) { ; CHECK-LABEL: sdiv_nxv8i16_x: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z3.s, z1.h ; CHECK-NEXT: sunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: sunpklo z4.s, z0.h @@ -288,9 +288,9 @@ entry: define @udiv_nxv8i16_x( %x, %y, %n) { ; CHECK-LABEL: udiv_nxv8i16_x: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z3.s, z1.h ; CHECK-NEXT: uunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uunpklo z4.s, z0.h @@ -376,17 +376,17 @@ entry: define @srem_nxv8i16_x( %x, %y, %n) { ; CHECK-LABEL: srem_nxv8i16_x: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z3.s, z1.h ; CHECK-NEXT: sunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z5.s, z0.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: sunpklo z4.s, z1.h ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, #0 -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z3.h +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: ret entry: %c = icmp sgt %n, zeroinitializer @@ -419,8 +419,8 @@ define @srem_nxv16i8_x( %x, %n, zeroinitializer @@ -464,17 +464,17 @@ entry: define @urem_nxv8i16_x( %x, %y, %n) { ; CHECK-LABEL: urem_nxv8i16_x: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z3.s, z1.h ; CHECK-NEXT: uunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z5.s, z0.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uunpklo z4.s, z1.h ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, #0 -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z3.h +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: ret entry: %c = icmp sgt %n, zeroinitializer @@ -507,8 +507,8 @@ define @urem_nxv16i8_x( %x, %n, zeroinitializer @@ -1140,8 +1140,8 @@ define @fdiv_nxv8f16_x( %x, @sdiv_nxv8i16_y( %x, %y, %n) { ; CHECK-LABEL: sdiv_nxv8i16_y: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z3.s, z1.h ; CHECK-NEXT: sunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: sunpklo z4.s, z1.h @@ -1740,9 +1740,9 @@ entry: define @udiv_nxv8i16_y( %x, %y, %n) { ; CHECK-LABEL: udiv_nxv8i16_y: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z3.s, z1.h ; CHECK-NEXT: uunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uunpklo z4.s, z1.h @@ -1830,17 +1830,17 @@ entry: define @srem_nxv8i16_y( %x, %y, %n) { ; CHECK-LABEL: srem_nxv8i16_y: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z3.s, z1.h ; CHECK-NEXT: sunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z5.s, z0.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: sunpklo z4.s, z1.h ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, #0 -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: msb z1.h, p0/m, z3.h, z0.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z3.h +; CHECK-NEXT: msb z1.h, p0/m, z2.h, z0.h ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: @@ -1874,8 +1874,8 @@ define @srem_nxv16i8_y( %x, @urem_nxv8i16_y( %x, %y, %n) { ; CHECK-LABEL: urem_nxv8i16_y: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z3.s, z1.h ; CHECK-NEXT: uunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z5.s, z0.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uunpklo z4.s, z1.h ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, #0 -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: msb z1.h, p0/m, z3.h, z0.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z3.h +; CHECK-NEXT: msb z1.h, p0/m, z2.h, z0.h ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: @@ -1966,8 +1966,8 @@ define @urem_nxv16i8_y( %x, @fdiv_nxv8f16_y( %x, @fmai_nxv4f32_y( %x, %n, zeroinitializer @@ -2871,8 +2871,8 @@ define @fmai_nxv8f16_y( %x, %n, zeroinitializer @@ -2887,8 +2887,8 @@ define @fmai_nxv2f64_y( %x, %n, zeroinitializer @@ -2903,8 +2903,8 @@ define @fma_nxv4f32_y( %x, %n, zeroinitializer @@ -2920,8 +2920,8 @@ define @fma_nxv8f16_y( %x, %n, zeroinitializer @@ -2937,8 +2937,8 @@ define @fma_nxv2f64_y( %x, %n, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll index 0f09f7dac2982..bafd5abcc7b23 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll @@ -792,8 +792,8 @@ define @fdiv_nxv8f16_x( %x, @fdiv_nxv8f16_y( %x, @fmai_nxv4f32_y( %x, %n, zeroinitializer @@ -1750,8 +1750,8 @@ define @fmai_nxv8f16_y( %x, %n, zeroinitializer @@ -1766,8 +1766,8 @@ define @fmai_nxv2f64_y( %x, %n, zeroinitializer @@ -1782,8 +1782,8 @@ define @fma_nxv4f32_y( %x, %n, zeroinitializer @@ -1799,8 +1799,8 @@ define @fma_nxv8f16_y( %x, %n, zeroinitializer @@ -1816,8 +1816,8 @@ define @fma_nxv2f64_y( %x, %n, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll index fbadbf7226fd1..8bd38d7bc44df 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll @@ -320,8 +320,8 @@ define i1 @cmp32_ptest_any_xx( %pg, %a, %pg, %a, %b) { ; CHECK-LABEL: cmp8_ptest_first_ax: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cmpge p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: cset w0, mi ; CHECK-NEXT: ret @@ -338,8 +338,8 @@ define i1 @cmp8_ptest_first_ax( %pg, %a, %pg, %a, %b) { ; CHECK-LABEL: cmp8_ptest_last_ax: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cmpge p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret @@ -371,8 +371,8 @@ define i1 @cmp8_ptest_any_ax( %pg, %a, %pg, %a, %b) { ; CHECK-LABEL: cmp32_ptest_first_ax: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: cmpge p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: cset w0, mi ; CHECK-NEXT: ret @@ -390,8 +390,8 @@ define i1 @cmp32_ptest_first_ax( %pg, %a, < define i1 @cmp32_ptest_last_ax( %pg, %a, %b) { ; CHECK-LABEL: cmp32_ptest_last_ax: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: cmpge p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-redundant-store.ll b/llvm/test/CodeGen/AArch64/sve-redundant-store.ll index 6873404724f1d..508fe5d5a58a5 100644 --- a/llvm/test/CodeGen/AArch64/sve-redundant-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-redundant-store.ll @@ -35,8 +35,8 @@ entry: define void @keep_scalable_store(ptr writeonly %ptr, ptr %a, %b) { ; CHECK-LABEL: keep_scalable_store: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp q2, q1, [x1] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll index a1c2ec9c7e1d4..76190eba870de 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll @@ -22,15 +22,15 @@ define i8 @split_extract_32i8_idx( %a, i32 %idx) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: rdvl x8, #2 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1b { z1.b }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldrb w0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -46,15 +46,15 @@ define i16 @split_extract_16i16_idx( %a, i32 %idx) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldrh w0, [x9, x8, lsl #1] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -70,15 +70,15 @@ define i32 @split_extract_8i32_idx( %a, i32 %idx) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cnth x8 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldr w0, [x9, x8, lsl #2] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -94,15 +94,15 @@ define i64 @split_extract_8i64_idx( %a, i32 %idx) { ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cnth x8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] ; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: ldr x0, [x9, x8, lsl #3] @@ -140,15 +140,15 @@ define i16 @split_extract_16i16( %a) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w9, #128 // =0x80 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldrh w0, [x9, x8, lsl #1] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -164,16 +164,16 @@ define i32 @split_extract_16i32( %a) { ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: rdvl x8, #1 ; CHECK-NEXT: mov w9, #34464 // =0x86a0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movk w9, #1, lsl #16 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl] ; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: ldr w0, [x9, x8, lsl #2] @@ -191,15 +191,15 @@ define i64 @split_extract_4i64( %a) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntw x8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w9, #10 // =0xa ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, #10 -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldr x0, [x9, x8, lsl #3] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll index 3997409172d03..bc015116917d8 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll @@ -6,9 +6,9 @@ define @fcvts_nxv8f16( %a) { ; CHECK-LABEL: fcvts_nxv8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z1.s, z0.h ; CHECK-NEXT: uunpkhi z2.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvt z0.s, p0/m, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -21,9 +21,9 @@ define @fcvts_nxv8f16( %a) { define @fcvtd_nxv4f16( %a) { ; CHECK-LABEL: fcvtd_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvt z0.d, p0/m, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -37,8 +37,8 @@ define @fcvtd_nxv8f16( %a) { ; CHECK-LABEL: fcvtd_nxv8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z2.d, z1.s ; CHECK-NEXT: uunpkhi z1.d, z1.s ; CHECK-NEXT: uunpklo z3.d, z0.s @@ -58,9 +58,9 @@ define @fcvtd_nxv8f16( %a) { define @fcvtd_nxv4f32( %a) { ; CHECK-LABEL: fcvtd_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvt z0.d, p0/m, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -73,11 +73,11 @@ define @fcvtd_nxv4f32( %a) { define @fcvtd_nxv8f32( %a) { ; CHECK-LABEL: fcvtd_nxv8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: uunpkhi z3.d, z0.s ; CHECK-NEXT: uunpklo z4.d, z1.s ; CHECK-NEXT: uunpkhi z5.d, z1.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z2 ; CHECK-NEXT: fcvt z0.d, p0/m, z2.s ; CHECK-NEXT: movprfx z1, z3 @@ -195,9 +195,9 @@ define @fcvtzs_h_nxv8f64( %a) { define @fcvtzs_d_nxv4f32( %a) { ; CHECK-LABEL: fcvtzs_d_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -210,11 +210,11 @@ define @fcvtzs_d_nxv4f32( %a) { define @fcvtzs_s_nxv16f16( %a) { ; CHECK-LABEL: fcvtzs_s_nxv16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: uunpkhi z3.s, z0.h ; CHECK-NEXT: uunpklo z4.s, z1.h ; CHECK-NEXT: uunpkhi z5.s, z1.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movprfx z0, z2 ; CHECK-NEXT: fcvtzs z0.s, p0/m, z2.h ; CHECK-NEXT: movprfx z1, z3 @@ -247,9 +247,9 @@ define @fcvtzu_s_nxv4f64( %a) { define @fcvtzu_d_nxv4f32( %a) { ; CHECK-LABEL: fcvtzu_d_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -295,8 +295,8 @@ define @scvtf_s_nxv16i8( %a) { ; CHECK-LABEL: scvtf_s_nxv16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: sunpklo z1.h, z0.b -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z0.h, z0.b +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpkhi z1.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h @@ -316,9 +316,9 @@ define @scvtf_s_nxv16i8( %a) { define @scvtf_d_nxv4i32( %a) { ; CHECK-LABEL: scvtf_d_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sunpklo z1.d, z0.s ; CHECK-NEXT: sunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: scvtf z0.d, p0/m, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -333,8 +333,8 @@ define @scvtf_d_nxv4i1( %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: punpklo p2.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: scvtf z0.d, p1/m, z0.d ; CHECK-NEXT: scvtf z1.d, p1/m, z1.d @@ -378,9 +378,9 @@ define @ucvtf_h_nxv8i64( %a) { define @ucvtf_d_nxv4i32( %a) { ; CHECK-LABEL: ucvtf_d_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ucvtf z0.d, p0/m, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -395,8 +395,8 @@ define @ucvtf_d_nxv4i1( %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: punpklo p2.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p2/z, #1 // =0x1 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1 ; CHECK-NEXT: ucvtf z0.d, p1/m, z0.d ; CHECK-NEXT: ucvtf z1.d, p1/m, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll index 7f642882eddbe..696b6c34ef041 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll @@ -23,8 +23,8 @@ define double @fadda_nxv8f64(double %init, %a) { define float @faddv_nxv8f32(float %init, %a) { ; CHECK-LABEL: faddv_nxv8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fadd z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: faddv s1, p0, z1.s ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll index 5441659fa5cb4..75366384cb750 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll @@ -6,9 +6,9 @@ define @promote_insert_8i8( %a, i8 %elt, i64 %idx) { ; CHECK-LABEL: promote_insert_8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: index z1.h, #0, #1 ; CHECK-NEXT: mov z2.h, w1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, w0 ; CHECK-NEXT: ret @@ -23,13 +23,13 @@ define @split_insert_32i8_idx( %a, i8 %elt, ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: rdvl x8, #2 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x1, x8 -; CHECK-NEXT: csel x8, x1, x8, lo ; CHECK-NEXT: st1b { z1.b }, p0, [sp, #1, mul vl] +; CHECK-NEXT: csel x8, x1, x8, lo ; CHECK-NEXT: st1b { z0.b }, p0, [sp] ; CHECK-NEXT: strb w0, [x9, x8] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] @@ -48,13 +48,13 @@ define @split_insert_8f32_idx( %a, floa ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cnth x8 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x0, x8 -; CHECK-NEXT: csel x8, x0, x8, lo ; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] +; CHECK-NEXT: csel x8, x0, x8, lo ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: str s2, [x9, x8, lsl #2] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] @@ -73,13 +73,13 @@ define @split_insert_8i64_idx( %a, i64 %elt ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cnth x8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x1, x8 -; CHECK-NEXT: csel x8, x1, x8, lo ; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] +; CHECK-NEXT: csel x8, x1, x8, lo ; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] ; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] @@ -100,9 +100,9 @@ define @split_insert_8i64_idx( %a, i64 %elt define @promote_insert_4i16( %a, i16 %elt) { ; CHECK-LABEL: promote_insert_4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #5 // =0x5 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, w0 @@ -117,9 +117,9 @@ define @promote_insert_4i16( %a, i16 %elt) define @split_insert_32i8( %a, i8 %elt) { ; CHECK-LABEL: split_insert_32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z2.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z3.b, w8 ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b ; CHECK-NEXT: mov z0.b, p0/m, w0 @@ -135,14 +135,14 @@ define @split_insert_32i16( %a, i16 %elt) ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: rdvl x8, #2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w9, #128 // =0x80 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, #128 +; CHECK-NEXT: st1h { z3.h }, p0, [sp, #3, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1h { z3.h }, p0, [sp, #3, mul vl] ; CHECK-NEXT: st1h { z2.h }, p0, [sp, #2, mul vl] ; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1h { z0.h }, p0, [sp] @@ -165,15 +165,15 @@ define @split_insert_8i32( %a, i32 %elt) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cnth x8 ; CHECK-NEXT: mov w9, #16960 // =0x4240 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movk w9, #15, lsl #16 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: str w0, [x9, x8, lsl #2] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] diff --git a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll index 42f3a163d14cc..dd7b15ef5ee6f 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll @@ -17,8 +17,8 @@ define i8 @andv_nxv8i8( %a) { define i32 @andv_nxv8i32( %a) { ; CHECK-LABEL: andv_nxv8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -71,8 +71,8 @@ define i16 @xorv_nxv2i16( %a) { define i32 @xorv_nxv8i32( %a) { ; CHECK-LABEL: xorv_nxv8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -97,8 +97,8 @@ define i16 @uaddv_nxv4i16( %a) { define i16 @uaddv_nxv16i16( %a) { ; CHECK-LABEL: uaddv_nxv16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: uaddv d0, p0, z0.h ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -127,8 +127,8 @@ define i32 @uaddv_nxv16i32( %a) { define i32 @umin_nxv2i32( %a) { ; CHECK-LABEL: umin_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 diff --git a/llvm/test/CodeGen/AArch64/sve-split-load.ll b/llvm/test/CodeGen/AArch64/sve-split-load.ll index af03059cf0d8b..754f0339702dc 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-load.ll @@ -93,8 +93,8 @@ define @masked_load_split_32i16(ptr %a, % ; CHECK-NEXT: punpklo p2.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: punpklo p3.h, p1.b -; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: ld1h { z0.h }, p2/z, [x0] +; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, #1, mul vl] ; CHECK-NEXT: ld1h { z2.h }, p3/z, [x0, #2, mul vl] ; CHECK-NEXT: ld1h { z3.h }, p1/z, [x0, #3, mul vl] @@ -123,8 +123,8 @@ define @masked_load_split_8i64(ptr %a, %pg) ; CHECK-NEXT: punpklo p2.h, p1.b ; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: punpklo p3.h, p0.b -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: ld1d { z0.d }, p2/z, [x0] +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x0, #1, mul vl] ; CHECK-NEXT: ld1d { z2.d }, p3/z, [x0, #2, mul vl] ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #3, mul vl] diff --git a/llvm/test/CodeGen/AArch64/sve-split-store.ll b/llvm/test/CodeGen/AArch64/sve-split-store.ll index 90ec783ea4dbc..affa9a18ac182 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-store.ll @@ -81,8 +81,8 @@ define void @masked_store_split_32i16( %data, ptr %a, %data, ptr %a, %data, ptr %a, i32 1, %pg) diff --git a/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll b/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll index 9c3d4b1e5a810..f556d60d23b88 100644 --- a/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll +++ b/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll @@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu" define @srem_combine_loop( %a) #0 { ; CHECK-LABEL: srem_combine_loop: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #2 // =0x2 ; CHECK-NEXT: asrd z1.s, p0/m, z1.s, #1 ; CHECK-NEXT: mls z0.s, p0/m, z1.s, z2.s diff --git a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll index 728041d3f916b..3273e6b384f63 100644 --- a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll @@ -105,8 +105,8 @@ define void @st1d_inbound( %data, ptr %a) { define void @store_nxv2f32(ptr %out) { ; CHECK-LABEL: store_nxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmov z0.s, #1.00000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1w { z0.d }, p0, [x0] ; CHECK-NEXT: ret %ins = insertelement undef, float 1.0, i32 0 @@ -118,8 +118,8 @@ define void @store_nxv2f32(ptr %out) { define void @store_nxv4f16(ptr %out) { ; CHECK-LABEL: store_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmov z0.h, #1.00000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret %ins = insertelement undef, half 1.0, i32 0 @@ -133,9 +133,9 @@ define void @store_nxv4f16(ptr %out) { define void @store_nxv6f32(ptr %out) { ; CHECK-LABEL: store_nxv6f32: ; CHECK: // %bb.0: +; CHECK-NEXT: fmov z0.s, #1.00000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: fmov z0.s, #1.00000000 ; CHECK-NEXT: st1w { z0.d }, p0, [x0, #2, mul vl] ; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret @@ -148,9 +148,9 @@ define void @store_nxv6f32(ptr %out) { define void @store_nxv12f16(ptr %out) { ; CHECK-LABEL: store_nxv12f16: ; CHECK: // %bb.0: +; CHECK-NEXT: fmov z0.h, #1.00000000 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: fmov z0.h, #1.00000000 ; CHECK-NEXT: st1h { z0.s }, p0, [x0, #2, mul vl] ; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-stepvector.ll b/llvm/test/CodeGen/AArch64/sve-stepvector.ll index 6f5a31248de7e..4c5f27d3e7093 100644 --- a/llvm/test/CodeGen/AArch64/sve-stepvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-stepvector.ll @@ -208,9 +208,9 @@ entry: define @multiple_use_stepvector_nxv4i32_1(i32 %data) { ; CHECK-LABEL: multiple_use_stepvector_nxv4i32_1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: index z0.s, w0, #1 ; CHECK-NEXT: mov z1.s, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mul z1.s, p0/m, z1.s, z0.s ; CHECK-NEXT: sub z0.s, z1.s, z0.s ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll index be5c318e675df..d547f99a0230a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll @@ -11,8 +11,8 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i8> @ctlz_v4i8(<4 x i8> %op) { ; CHECK-LABEL: ctlz_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: sub z0.h, z0.h, #8 // =0x8 @@ -49,8 +49,8 @@ define <16 x i8> @ctlz_v16i8(<16 x i8> %op) { define void @ctlz_v32i8(ptr %a) { ; CHECK-LABEL: ctlz_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: clz z0.b, p0/m, z0.b ; CHECK-NEXT: clz z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] @@ -64,8 +64,8 @@ define void @ctlz_v32i8(ptr %a) { define <2 x i16> @ctlz_v2i16(<2 x i16> %op) { ; CHECK-LABEL: ctlz_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: sub z0.s, z0.s, #16 // =0x10 @@ -102,8 +102,8 @@ define <8 x i16> @ctlz_v8i16(<8 x i16> %op) { define void @ctlz_v16i16(ptr %a) { ; CHECK-LABEL: ctlz_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: clz z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -141,8 +141,8 @@ define <4 x i32> @ctlz_v4i32(<4 x i32> %op) { define void @ctlz_v8i32(ptr %a) { ; CHECK-LABEL: ctlz_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: clz z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -180,8 +180,8 @@ define <2 x i64> @ctlz_v2i64(<2 x i64> %op) { define void @ctlz_v4i64(ptr %a) { ; CHECK-LABEL: ctlz_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: clz z0.d, p0/m, z0.d ; CHECK-NEXT: clz z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -199,8 +199,8 @@ define void @ctlz_v4i64(ptr %a) { define <4 x i8> @ctpop_v4i8(<4 x i8> %op) { ; CHECK-LABEL: ctpop_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -236,8 +236,8 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) { define void @ctpop_v32i8(ptr %a) { ; CHECK-LABEL: ctpop_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: cnt z0.b, p0/m, z0.b ; CHECK-NEXT: cnt z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] @@ -251,8 +251,8 @@ define void @ctpop_v32i8(ptr %a) { define <2 x i16> @ctpop_v2i16(<2 x i16> %op) { ; CHECK-LABEL: ctpop_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -288,8 +288,8 @@ define <8 x i16> @ctpop_v8i16(<8 x i16> %op) { define void @ctpop_v16i16(ptr %a) { ; CHECK-LABEL: ctpop_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: cnt z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -327,8 +327,8 @@ define <4 x i32> @ctpop_v4i32(<4 x i32> %op) { define void @ctpop_v8i32(ptr %a) { ; CHECK-LABEL: ctpop_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: cnt z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -366,8 +366,8 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %op) { define void @ctpop_v4i64(ptr %a) { ; CHECK-LABEL: ctpop_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: cnt z0.d, p0/m, z0.d ; CHECK-NEXT: cnt z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -385,8 +385,8 @@ define void @ctpop_v4i64(ptr %a) { define <4 x i8> @cttz_v4i8(<4 x i8> %op) { ; CHECK-LABEL: cttz_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: orr z0.h, z0.h, #0x100 ; CHECK-NEXT: rbit z0.h, p0/m, z0.h ; CHECK-NEXT: clz z0.h, p0/m, z0.h @@ -425,8 +425,8 @@ define <16 x i8> @cttz_v16i8(<16 x i8> %op) { define void @cttz_v32i8(ptr %a) { ; CHECK-LABEL: cttz_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: rbit z0.b, p0/m, z0.b ; CHECK-NEXT: rbit z1.b, p0/m, z1.b ; CHECK-NEXT: clz z0.b, p0/m, z0.b @@ -442,8 +442,8 @@ define void @cttz_v32i8(ptr %a) { define <2 x i16> @cttz_v2i16(<2 x i16> %op) { ; CHECK-LABEL: cttz_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: orr z0.s, z0.s, #0x10000 ; CHECK-NEXT: rbit z0.s, p0/m, z0.s ; CHECK-NEXT: clz z0.s, p0/m, z0.s @@ -482,8 +482,8 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %op) { define void @cttz_v16i16(ptr %a) { ; CHECK-LABEL: cttz_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: rbit z0.h, p0/m, z0.h ; CHECK-NEXT: rbit z1.h, p0/m, z1.h ; CHECK-NEXT: clz z0.h, p0/m, z0.h @@ -525,8 +525,8 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %op) { define void @cttz_v8i32(ptr %a) { ; CHECK-LABEL: cttz_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: rbit z0.s, p0/m, z0.s ; CHECK-NEXT: rbit z1.s, p0/m, z1.s ; CHECK-NEXT: clz z0.s, p0/m, z0.s @@ -568,8 +568,8 @@ define <2 x i64> @cttz_v2i64(<2 x i64> %op) { define void @cttz_v4i64(ptr %a) { ; CHECK-LABEL: cttz_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: rbit z0.d, p0/m, z0.d ; CHECK-NEXT: rbit z1.d, p0/m, z1.d ; CHECK-NEXT: clz z0.d, p0/m, z0.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll index 251a7c3b18a9f..0aefba2d4c6ab 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -169,15 +169,15 @@ define <16 x i64> @load_zext_v16i16i64(ptr %ap) { ; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: mov x8, #8 // =0x8 ; CHECK-NEXT: ld1h { z2.d }, p0/z, [x0, x9, lsl #1] -; CHECK-NEXT: mov x9, #10 // =0xa ; CHECK-NEXT: ld1h { z3.d }, p0/z, [x0, x10, lsl #1] -; CHECK-NEXT: mov x10, #12 // =0xc +; CHECK-NEXT: mov x9, #10 // =0xa ; CHECK-NEXT: ld1h { z4.d }, p0/z, [x0, x8, lsl #1] -; CHECK-NEXT: mov x8, #14 // =0xe +; CHECK-NEXT: mov x8, #12 // =0xc +; CHECK-NEXT: mov x10, #14 // =0xe ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1h { z5.d }, p0/z, [x0, x9, lsl #1] -; CHECK-NEXT: ld1h { z6.d }, p0/z, [x0, x10, lsl #1] -; CHECK-NEXT: ld1h { z7.d }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z6.d }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z7.d }, p0/z, [x0, x10, lsl #1] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll index 2ace0bca274af..0d6675def8b52 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll @@ -241,8 +241,8 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) { ; SVE-NEXT: ptrue p0.d ; SVE-NEXT: ldr q0, [x1] ; SVE-NEXT: ldr d1, [x0] -; SVE-NEXT: and z1.s, z1.s, #0x7fffffff ; SVE-NEXT: fcvt z0.s, p0/m, z0.d +; SVE-NEXT: and z1.s, z1.s, #0x7fffffff ; SVE-NEXT: uzp1 z0.s, z0.s, z0.s ; SVE-NEXT: and z0.s, z0.s, #0x80000000 ; SVE-NEXT: orr z0.d, z1.d, z0.d @@ -274,8 +274,8 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) { define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) { ; SVE-LABEL: test_copysign_v4f32_v4f64: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.d ; SVE-NEXT: ldp q0, q1, [x1] +; SVE-NEXT: ptrue p0.d ; SVE-NEXT: fcvt z1.s, p0/m, z1.d ; SVE-NEXT: fcvt z0.s, p0/m, z0.d ; SVE-NEXT: ptrue p0.s, vl2 @@ -291,8 +291,8 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) { ; ; SVE2-LABEL: test_copysign_v4f32_v4f64: ; SVE2: // %bb.0: -; SVE2-NEXT: ptrue p0.d ; SVE2-NEXT: ldp q0, q1, [x1] +; SVE2-NEXT: ptrue p0.d ; SVE2-NEXT: ldr q2, [x0] ; SVE2-NEXT: fcvt z1.s, p0/m, z1.d ; SVE2-NEXT: fcvt z0.s, p0/m, z0.d @@ -319,8 +319,8 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) { ; SVE: // %bb.0: ; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldr q0, [x0] -; SVE-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; SVE-NEXT: ld1w { z1.d }, p0/z, [x1] +; SVE-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; SVE-NEXT: fcvt z1.d, p0/m, z1.s ; SVE-NEXT: and z1.d, z1.d, #0x8000000000000000 ; SVE-NEXT: orr z0.d, z0.d, z1.d @@ -354,10 +354,10 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) { ; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: mov x8, #2 // =0x2 ; SVE-NEXT: ldp q2, q3, [x0] -; SVE-NEXT: and z2.d, z2.d, #0x7fffffffffffffff -; SVE-NEXT: and z3.d, z3.d, #0x7fffffffffffffff ; SVE-NEXT: ld1w { z0.d }, p0/z, [x1] ; SVE-NEXT: ld1w { z1.d }, p0/z, [x1, x8, lsl #2] +; SVE-NEXT: and z2.d, z2.d, #0x7fffffffffffffff +; SVE-NEXT: and z3.d, z3.d, #0x7fffffffffffffff ; SVE-NEXT: fcvt z0.d, p0/m, z0.s ; SVE-NEXT: fcvt z1.d, p0/m, z1.s ; SVE-NEXT: and z0.d, z0.d, #0x8000000000000000 @@ -397,8 +397,8 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) { ; SVE-NEXT: ptrue p0.s ; SVE-NEXT: ldr q0, [x1] ; SVE-NEXT: ldr d1, [x0] -; SVE-NEXT: and z1.h, z1.h, #0x7fff ; SVE-NEXT: fcvt z0.h, p0/m, z0.s +; SVE-NEXT: and z1.h, z1.h, #0x7fff ; SVE-NEXT: uzp1 z0.h, z0.h, z0.h ; SVE-NEXT: and z0.h, z0.h, #0x8000 ; SVE-NEXT: orr z0.d, z1.d, z0.d @@ -429,13 +429,13 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { ; SVE: // %bb.0: ; SVE-NEXT: ldp q0, q1, [x1] ; SVE-NEXT: ptrue p0.s, vl2 -; SVE-NEXT: ptrue p1.s ; SVE-NEXT: fcvtxn v1.2s, v1.2d ; SVE-NEXT: fcvtxn v0.2s, v0.2d ; SVE-NEXT: splice z0.s, p0, z0.s, z1.s +; SVE-NEXT: ptrue p0.s ; SVE-NEXT: ldr d1, [x0] ; SVE-NEXT: and z1.h, z1.h, #0x7fff -; SVE-NEXT: fcvt z0.h, p1/m, z0.s +; SVE-NEXT: fcvt z0.h, p0/m, z0.s ; SVE-NEXT: uzp1 z0.h, z0.h, z0.h ; SVE-NEXT: and z0.h, z0.h, #0x8000 ; SVE-NEXT: orr z0.d, z1.d, z0.d @@ -446,13 +446,13 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { ; SVE2: // %bb.0: ; SVE2-NEXT: ldp q0, q1, [x1] ; SVE2-NEXT: ptrue p0.s, vl2 -; SVE2-NEXT: ptrue p1.s ; SVE2-NEXT: ldr d2, [x0] ; SVE2-NEXT: fcvtxn v1.2s, v1.2d ; SVE2-NEXT: fcvtxn v0.2s, v0.2d ; SVE2-NEXT: splice z0.s, p0, z0.s, z1.s +; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: mov z1.h, #32767 // =0x7fff -; SVE2-NEXT: fcvt z0.h, p1/m, z0.s +; SVE2-NEXT: fcvt z0.h, p0/m, z0.s ; SVE2-NEXT: uzp1 z0.h, z0.h, z0.h ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str d2, [x0] @@ -470,8 +470,8 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) { ; SVE-LABEL: test_copysign_v8f16_v8f32: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s ; SVE-NEXT: ldp q0, q1, [x1] +; SVE-NEXT: ptrue p0.s ; SVE-NEXT: fcvt z1.h, p0/m, z1.s ; SVE-NEXT: fcvt z0.h, p0/m, z0.s ; SVE-NEXT: ptrue p0.h, vl4 @@ -487,8 +487,8 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) { ; ; SVE2-LABEL: test_copysign_v8f16_v8f32: ; SVE2: // %bb.0: -; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: ldp q0, q1, [x1] +; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: ldr q2, [x0] ; SVE2-NEXT: fcvt z1.h, p0/m, z1.s ; SVE2-NEXT: fcvt z0.h, p0/m, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll index c436dea8ff1b2..c2d6ed4e9ccf9 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll @@ -50,8 +50,8 @@ define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fadd_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -94,8 +94,8 @@ define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fadd_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -125,8 +125,8 @@ define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fadd_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -186,8 +186,8 @@ define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fdiv_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fdiv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fdivr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -230,8 +230,8 @@ define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fdiv_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fdiv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fdivr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -261,8 +261,8 @@ define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fdiv_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fdiv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fdivr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -325,8 +325,8 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) define void @fma_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.h, p0/m, z2.h, z1.h @@ -373,8 +373,8 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o define void @fma_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.s, p0/m, z2.s, z1.s @@ -407,8 +407,8 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double define void @fma_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.d, p0/m, z2.d, z1.d @@ -470,8 +470,8 @@ define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fmul_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fmul_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -514,8 +514,8 @@ define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fmul_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fmul_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -545,8 +545,8 @@ define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fmul_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fmul_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -603,8 +603,8 @@ define <8 x half> @fneg_v8f16(<8 x half> %op) { define void @fneg_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fneg_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fneg z0.h, p0/m, z0.h ; CHECK-NEXT: fneg z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -642,8 +642,8 @@ define <4 x float> @fneg_v4f32(<4 x float> %op) { define void @fneg_v8f32(ptr %a) { ; CHECK-LABEL: fneg_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fneg z0.s, p0/m, z0.s ; CHECK-NEXT: fneg z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -669,8 +669,8 @@ define <2 x double> @fneg_v2f64(<2 x double> %op) { define void @fneg_v4f64(ptr %a) { ; CHECK-LABEL: fneg_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fneg z0.d, p0/m, z0.d ; CHECK-NEXT: fneg z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -724,8 +724,8 @@ define <8 x half> @fsqrt_v8f16(<8 x half> %op) { define void @fsqrt_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fsqrt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h ; CHECK-NEXT: fsqrt z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -763,8 +763,8 @@ define <4 x float> @fsqrt_v4f32(<4 x float> %op) { define void @fsqrt_v8f32(ptr %a) { ; CHECK-LABEL: fsqrt_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s ; CHECK-NEXT: fsqrt z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -790,8 +790,8 @@ define <2 x double> @fsqrt_v2f64(<2 x double> %op) { define void @fsqrt_v4f64(ptr %a) { ; CHECK-LABEL: fsqrt_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d ; CHECK-NEXT: fsqrt z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -848,8 +848,8 @@ define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fsub_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fsub_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fsubr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -892,8 +892,8 @@ define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fsub_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fsub_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fsubr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -923,8 +923,8 @@ define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fsub_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fsub_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fsubr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -981,8 +981,8 @@ define <8 x half> @fabs_v8f16(<8 x half> %op) { define void @fabs_v16f16(ptr %a) { ; CHECK-LABEL: fabs_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fabs z0.h, p0/m, z0.h ; CHECK-NEXT: fabs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -1020,8 +1020,8 @@ define <4 x float> @fabs_v4f32(<4 x float> %op) { define void @fabs_v8f32(ptr %a) { ; CHECK-LABEL: fabs_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fabs z0.s, p0/m, z0.s ; CHECK-NEXT: fabs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -1047,8 +1047,8 @@ define <2 x double> @fabs_v2f64(<2 x double> %op) { define void @fabs_v4f64(ptr %a) { ; CHECK-LABEL: fabs_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fabs z0.d, p0/m, z0.d ; CHECK-NEXT: fabs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll index aad078f035f7d..e92694d1fc80d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll @@ -57,8 +57,8 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_oeq_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h @@ -107,8 +107,8 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_oeq_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s @@ -157,8 +157,8 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_oeq_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z0.d ; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z3.d @@ -181,8 +181,8 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) { define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ueq_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmeq p2.h, p0/z, z1.h, z0.h @@ -209,8 +209,8 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_one_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmgt p2.h, p0/z, z1.h, z0.h @@ -237,8 +237,8 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_une_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmne p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmne p0.h, p0/z, z2.h, z3.h @@ -261,8 +261,8 @@ define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ogt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h @@ -285,8 +285,8 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ugt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h @@ -312,8 +312,8 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_olt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h @@ -336,8 +336,8 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ult_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h @@ -363,8 +363,8 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_oge_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h @@ -387,8 +387,8 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_uge_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h @@ -414,8 +414,8 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ole_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h @@ -438,8 +438,8 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ule_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h @@ -465,8 +465,8 @@ define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_uno_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z2.h, z3.h @@ -489,8 +489,8 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ord_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z2.h, z3.h @@ -516,8 +516,8 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_eq_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h @@ -540,8 +540,8 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ne_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmne p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmne p0.h, p0/z, z2.h, z3.h @@ -564,8 +564,8 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_gt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h @@ -588,8 +588,8 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_lt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h @@ -612,8 +612,8 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ge_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h @@ -636,8 +636,8 @@ define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_le_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll index 18f9a4d371d0c..9bdde14e8d83d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll @@ -8,9 +8,9 @@ target triple = "aarch64-unknown-linux-gnu" define void @fp_convert_combine_crash(ptr %a, ptr %b) { ; CHECK-LABEL: fp_convert_combine_crash: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmov z0.s, #8.00000000 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z0.s ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z2.s ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll index 28e02da53af43..244a405101739 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll @@ -11,8 +11,8 @@ target triple = "aarch64-unknown-linux-gnu" define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) { ; CHECK-LABEL: fcvt_v2f16_to_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h ; CHECK-NEXT: str d0, [x0] @@ -25,8 +25,8 @@ define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) { define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) { ; CHECK-LABEL: fcvt_v4f16_to_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h ; CHECK-NEXT: str q0, [x0] @@ -371,8 +371,8 @@ define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) { define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvt_v8f32_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: mov x8, #4 // =0x4 ; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: fcvt z1.h, p0/m, z1.s @@ -420,8 +420,8 @@ define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) { define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvt_v4f64_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov x8, #2 // =0x2 ; CHECK-NEXT: fcvt z0.h, p0/m, z0.d ; CHECK-NEXT: fcvt z1.h, p0/m, z1.d @@ -467,8 +467,8 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) { define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) { ; CHECK-LABEL: fcvt_v4f64_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov x8, #2 // =0x2 ; CHECK-NEXT: fcvt z0.s, p0/m, z0.d ; CHECK-NEXT: fcvt z1.s, p0/m, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll index b5df97f767c13..478be9ab76dd9 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll @@ -40,8 +40,8 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) define void @fma_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.h, p0/m, z2.h, z1.h @@ -91,8 +91,8 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o define void @fma_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.s, p0/m, z2.s, z1.s @@ -140,8 +140,8 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double define void @fma_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.d, p0/m, z2.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll index 07a67e2650290..4dc034adf459a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll @@ -37,8 +37,8 @@ define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fmaxnm_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fmaxnm_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -81,8 +81,8 @@ define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fmaxnm_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fmaxnm_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -123,8 +123,8 @@ define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fmaxnm_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fmaxnm_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -171,8 +171,8 @@ define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fminnm_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fminnm_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -215,8 +215,8 @@ define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fminnm_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fminnm_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -257,8 +257,8 @@ define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fminnm_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fminnm_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -305,8 +305,8 @@ define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fmax_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fmax_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -349,8 +349,8 @@ define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fmax_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fmax_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -391,8 +391,8 @@ define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fmax_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fmax_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -439,8 +439,8 @@ define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fmin_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fmin_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -483,8 +483,8 @@ define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fmin_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fmin_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -525,8 +525,8 @@ define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fmin_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fmin_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll index d2d771c48c204..bd10a0e091c0d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll @@ -211,8 +211,8 @@ define half @faddv_v8f16(half %start, <8 x half> %a) { define half @faddv_v16f16(half %start, ptr %a) { ; CHECK-LABEL: faddv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z2.h ; CHECK-NEXT: faddv h1, p0, z1.h ; CHECK-NEXT: fadd h0, h0, h1 @@ -249,8 +249,8 @@ define float @faddv_v4f32(float %start, <4 x float> %a) { define float @faddv_v8f32(float %start, ptr %a) { ; CHECK-LABEL: faddv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z2.s ; CHECK-NEXT: faddv s1, p0, z1.s ; CHECK-NEXT: fadd s0, s0, s1 @@ -285,8 +285,8 @@ define double @faddv_v2f64(double %start, <2 x double> %a) { define double @faddv_v4f64(double %start, ptr %a) { ; CHECK-LABEL: faddv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z2.d ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: fadd d0, d0, d1 @@ -327,8 +327,8 @@ define half @fmaxv_v8f16(<8 x half> %a) { define half @fmaxv_v16f16(ptr %a) { ; CHECK-LABEL: fmaxv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: fmaxnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 @@ -365,8 +365,8 @@ define float @fmaxv_v4f32(<4 x float> %a) { define float @fmaxv_v8f32(ptr %a) { ; CHECK-LABEL: fmaxv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fmaxnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -401,8 +401,8 @@ define double @fmaxv_v2f64(<2 x double> %a) { define double @fmaxv_v4f64(ptr %a) { ; CHECK-LABEL: fmaxv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: fmaxnmv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -443,8 +443,8 @@ define half @fminv_v8f16(<8 x half> %a) { define half @fminv_v16f16(ptr %a) { ; CHECK-LABEL: fminv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: fminnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 @@ -481,8 +481,8 @@ define float @fminv_v4f32(<4 x float> %a) { define float @fminv_v8f32(ptr %a) { ; CHECK-LABEL: fminv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fminnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -517,8 +517,8 @@ define double @fminv_v2f64(<2 x double> %a) { define double @fminv_v4f64(ptr %a) { ; CHECK-LABEL: fminv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: fminnmv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -559,8 +559,8 @@ define half @fmaximumv_v8f16(<8 x half> %a) { define half @fmaximumv_v16f16(ptr %a) { ; CHECK-LABEL: fmaximumv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: fmaxv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 @@ -597,8 +597,8 @@ define float @fmaximumv_v4f32(<4 x float> %a) { define float @fmaximumv_v8f32(ptr %a) { ; CHECK-LABEL: fmaximumv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fmaxv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -633,8 +633,8 @@ define double @fmaximumv_v2f64(<2 x double> %a) { define double @fmaximumv_v4f64(ptr %a) { ; CHECK-LABEL: fmaximumv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: fmaxv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -675,8 +675,8 @@ define half @fminimumv_v8f16(<8 x half> %a) { define half @fminimumv_v16f16(ptr %a) { ; CHECK-LABEL: fminimumv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: fminv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 @@ -713,8 +713,8 @@ define float @fminimumv_v4f32(<4 x float> %a) { define float @fminimumv_v8f32(ptr %a) { ; CHECK-LABEL: fminimumv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fminv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -749,8 +749,8 @@ define double @fminimumv_v2f64(<2 x double> %a) { define double @fminimumv_v4f64(ptr %a) { ; CHECK-LABEL: fminimumv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: fminv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll index 580b43531070f..24832d807c649 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll @@ -47,8 +47,8 @@ define <8 x half> @frintp_v8f16(<8 x half> %op) { define void @frintp_v16f16(ptr %a) { ; CHECK-LABEL: frintp_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintp z0.h, p0/m, z0.h ; CHECK-NEXT: frintp z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -86,8 +86,8 @@ define <4 x float> @frintp_v4f32(<4 x float> %op) { define void @frintp_v8f32(ptr %a) { ; CHECK-LABEL: frintp_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintp z0.s, p0/m, z0.s ; CHECK-NEXT: frintp z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -123,8 +123,8 @@ define <2 x double> @frintp_v2f64(<2 x double> %op) { define void @frintp_v4f64(ptr %a) { ; CHECK-LABEL: frintp_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintp z0.d, p0/m, z0.d ; CHECK-NEXT: frintp z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -178,8 +178,8 @@ define <8 x half> @frintm_v8f16(<8 x half> %op) { define void @frintm_v16f16(ptr %a) { ; CHECK-LABEL: frintm_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintm z0.h, p0/m, z0.h ; CHECK-NEXT: frintm z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -217,8 +217,8 @@ define <4 x float> @frintm_v4f32(<4 x float> %op) { define void @frintm_v8f32(ptr %a) { ; CHECK-LABEL: frintm_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintm z0.s, p0/m, z0.s ; CHECK-NEXT: frintm z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -254,8 +254,8 @@ define <2 x double> @frintm_v2f64(<2 x double> %op) { define void @frintm_v4f64(ptr %a) { ; CHECK-LABEL: frintm_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintm z0.d, p0/m, z0.d ; CHECK-NEXT: frintm z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -309,8 +309,8 @@ define <8 x half> @frinti_v8f16(<8 x half> %op) { define void @frinti_v16f16(ptr %a) { ; CHECK-LABEL: frinti_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frinti z0.h, p0/m, z0.h ; CHECK-NEXT: frinti z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -348,8 +348,8 @@ define <4 x float> @frinti_v4f32(<4 x float> %op) { define void @frinti_v8f32(ptr %a) { ; CHECK-LABEL: frinti_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frinti z0.s, p0/m, z0.s ; CHECK-NEXT: frinti z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -385,8 +385,8 @@ define <2 x double> @frinti_v2f64(<2 x double> %op) { define void @frinti_v4f64(ptr %a) { ; CHECK-LABEL: frinti_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frinti z0.d, p0/m, z0.d ; CHECK-NEXT: frinti z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -440,8 +440,8 @@ define <8 x half> @frintx_v8f16(<8 x half> %op) { define void @frintx_v16f16(ptr %a) { ; CHECK-LABEL: frintx_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: frintx z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -479,8 +479,8 @@ define <4 x float> @frintx_v4f32(<4 x float> %op) { define void @frintx_v8f32(ptr %a) { ; CHECK-LABEL: frintx_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: frintx z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -516,8 +516,8 @@ define <2 x double> @frintx_v2f64(<2 x double> %op) { define void @frintx_v4f64(ptr %a) { ; CHECK-LABEL: frintx_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: frintx z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -571,8 +571,8 @@ define <8 x half> @frinta_v8f16(<8 x half> %op) { define void @frinta_v16f16(ptr %a) { ; CHECK-LABEL: frinta_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frinta z0.h, p0/m, z0.h ; CHECK-NEXT: frinta z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -610,8 +610,8 @@ define <4 x float> @frinta_v4f32(<4 x float> %op) { define void @frinta_v8f32(ptr %a) { ; CHECK-LABEL: frinta_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frinta z0.s, p0/m, z0.s ; CHECK-NEXT: frinta z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -647,8 +647,8 @@ define <2 x double> @frinta_v2f64(<2 x double> %op) { define void @frinta_v4f64(ptr %a) { ; CHECK-LABEL: frinta_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frinta z0.d, p0/m, z0.d ; CHECK-NEXT: frinta z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -702,8 +702,8 @@ define <8 x half> @frintn_v8f16(<8 x half> %op) { define void @frintn_v16f16(ptr %a) { ; CHECK-LABEL: frintn_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintn z0.h, p0/m, z0.h ; CHECK-NEXT: frintn z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -741,8 +741,8 @@ define <4 x float> @frintn_v4f32(<4 x float> %op) { define void @frintn_v8f32(ptr %a) { ; CHECK-LABEL: frintn_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintn z0.s, p0/m, z0.s ; CHECK-NEXT: frintn z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -778,8 +778,8 @@ define <2 x double> @frintn_v2f64(<2 x double> %op) { define void @frintn_v4f64(ptr %a) { ; CHECK-LABEL: frintn_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintn z0.d, p0/m, z0.d ; CHECK-NEXT: frintn z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -833,8 +833,8 @@ define <8 x half> @frintz_v8f16(<8 x half> %op) { define void @frintz_v16f16(ptr %a) { ; CHECK-LABEL: frintz_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintz z0.h, p0/m, z0.h ; CHECK-NEXT: frintz z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -872,8 +872,8 @@ define <4 x float> @frintz_v4f32(<4 x float> %op) { define void @frintz_v8f32(ptr %a) { ; CHECK-LABEL: frintz_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintz z0.s, p0/m, z0.s ; CHECK-NEXT: frintz z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -909,8 +909,8 @@ define <2 x double> @frintz_v2f64(<2 x double> %op) { define void @frintz_v4f64(ptr %a) { ; CHECK-LABEL: frintz_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintz z0.d, p0/m, z0.d ; CHECK-NEXT: frintz z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll index 73fd7e1465343..132225546fc4f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll @@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu" define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) { ; CHECK-LABEL: select_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -23,8 +23,8 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) { define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) { ; CHECK-LABEL: select_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -39,8 +39,8 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) { define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) { ; CHECK-LABEL: select_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -55,8 +55,8 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) { define void @select_v16f16(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z0.h, w2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z0.h, z0.h, #0x1 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: ldr q0, [x0] @@ -77,8 +77,8 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) { define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) { ; CHECK-LABEL: select_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -93,8 +93,8 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) { define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) { ; CHECK-LABEL: select_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -109,8 +109,8 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) { define void @select_v8f32(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w2, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ldr q0, [x0] @@ -150,9 +150,9 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) { ; CHECK-LABEL: select_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.d, x8 @@ -167,9 +167,9 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask define void @select_v4f64(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ldr q0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index d6adf9cf0ad67..58eae212d7999 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -36,8 +36,8 @@ define void @fcvtzu_v8f16_v8i16(ptr %a, ptr %b) { define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v16f16_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h ; CHECK-NEXT: fcvtzu z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] @@ -55,8 +55,8 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) { define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) { ; CHECK-LABEL: fcvtzu_v2f16_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -68,8 +68,8 @@ define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) { define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) { ; CHECK-LABEL: fcvtzu_v4f16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -302,8 +302,8 @@ define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) { define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) { ; CHECK-LABEL: fcvtzu_v8f32_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s ; CHECK-NEXT: ptrue p0.h, vl4 @@ -320,8 +320,8 @@ define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) { define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v16f32_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s @@ -373,8 +373,8 @@ define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) { define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v8f32_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s ; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] @@ -392,8 +392,8 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) { define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) { ; CHECK-LABEL: fcvtzu_v1f32_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -405,8 +405,8 @@ define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) { define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) { ; CHECK-LABEL: fcvtzu_v2f32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -491,8 +491,8 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s @@ -520,8 +520,8 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d @@ -563,24 +563,22 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: ldr q6, [x0, #112] -; CHECK-NEXT: ldp q4, q5, [x0, #80] -; CHECK-NEXT: ldr q7, [x0, #64] +; CHECK-NEXT: ldp q4, q5, [x0, #96] ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d +; CHECK-NEXT: ldp q6, q7, [x0, #64] ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d -; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d ; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d ; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d +; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s ; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov z16.s, z1.s[1] @@ -606,25 +604,26 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z3.s, z5.s[1] ; CHECK-NEXT: strh w8, [sp, #6] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z6.s[1] +; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s ; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strh w8, [sp, #24] +; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: strh w8, [sp, #20] +; CHECK-NEXT: strh w8, [sp, #24] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #16] +; CHECK-NEXT: strh w8, [sp, #20] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #30] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: strh w8, [sp, #16] ; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #26] +; CHECK-NEXT: strh w8, [sp, #30] ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strh w8, [sp, #18] ; CHECK-NEXT: ldp q1, q0, [sp] ; CHECK-NEXT: stp q1, q0, [x1] @@ -669,8 +668,8 @@ define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) { define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) { ; CHECK-LABEL: fcvtzu_v4f64_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 @@ -687,8 +686,8 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) { define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v8f64_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d @@ -740,8 +739,8 @@ define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) { define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v4f64_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] @@ -785,8 +784,8 @@ define void @fcvtzs_v8f16_v8i16(ptr %a, ptr %b) { define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v16f16_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h ; CHECK-NEXT: fcvtzs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] @@ -804,8 +803,8 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) { define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) { ; CHECK-LABEL: fcvtzs_v2f16_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -817,8 +816,8 @@ define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) { define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) { ; CHECK-LABEL: fcvtzs_v4f16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -1052,8 +1051,8 @@ define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) { define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) { ; CHECK-LABEL: fcvtzs_v8f32_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: ptrue p0.h, vl4 @@ -1070,8 +1069,8 @@ define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) { define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v16f32_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s @@ -1123,8 +1122,8 @@ define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) { define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v8f32_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] @@ -1142,8 +1141,8 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) { define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) { ; CHECK-LABEL: fcvtzs_v1f32_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -1155,8 +1154,8 @@ define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) { define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) { ; CHECK-LABEL: fcvtzs_v2f32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -1243,8 +1242,8 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s @@ -1272,8 +1271,8 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d @@ -1315,24 +1314,22 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: ldr q6, [x0, #112] -; CHECK-NEXT: ldp q4, q5, [x0, #80] -; CHECK-NEXT: ldr q7, [x0, #64] +; CHECK-NEXT: ldp q4, q5, [x0, #96] ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d +; CHECK-NEXT: ldp q6, q7, [x0, #64] ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d -; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d ; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d ; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d +; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s ; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov z16.s, z1.s[1] @@ -1358,25 +1355,26 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z3.s, z5.s[1] ; CHECK-NEXT: strh w8, [sp, #6] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z6.s[1] +; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s ; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strh w8, [sp, #24] +; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: strh w8, [sp, #20] +; CHECK-NEXT: strh w8, [sp, #24] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #16] +; CHECK-NEXT: strh w8, [sp, #20] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #30] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: strh w8, [sp, #16] ; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #26] +; CHECK-NEXT: strh w8, [sp, #30] ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strh w8, [sp, #18] ; CHECK-NEXT: ldp q1, q0, [sp] ; CHECK-NEXT: stp q1, q0, [x1] @@ -1421,8 +1419,8 @@ define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) { define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) { ; CHECK-LABEL: fcvtzs_v4f64_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 @@ -1439,8 +1437,8 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) { define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v8f64_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d @@ -1492,8 +1490,8 @@ define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) { define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v4f64_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll index ee8704284def5..4c5a6fe2fd231 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -71,8 +71,8 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask define void @select_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h @@ -128,8 +128,8 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m define void @select_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s @@ -186,8 +186,8 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> define void @select_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d ; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z3.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll index 0b3e7695e6a0a..4aa965777c742 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll @@ -11,9 +11,9 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) { ; CHECK-LABEL: insertelement_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -28,9 +28,9 @@ define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) { define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) { ; CHECK-LABEL: insertelement_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -45,9 +45,9 @@ define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) { define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) { ; CHECK-LABEL: insertelement_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -62,9 +62,9 @@ define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) { define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) { ; CHECK-LABEL: insertelement_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: index z2.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z3.b, w8 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -80,9 +80,9 @@ define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) { define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) { ; CHECK-LABEL: insertelement_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -97,9 +97,9 @@ define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) { define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) { ; CHECK-LABEL: insertelement_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -114,9 +114,9 @@ define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) { define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) { ; CHECK-LABEL: insertelement_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -131,9 +131,9 @@ define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) { define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) { ; CHECK-LABEL: insertelement_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z2.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -149,9 +149,9 @@ define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) { define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) { ; CHECK-LABEL: insertelement_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -166,9 +166,9 @@ define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) { define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) { ; CHECK-LABEL: insertelement_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -183,9 +183,9 @@ define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) { define <8 x i32> @insertelement_v8i32(ptr %a) { ; CHECK-LABEL: insertelement_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #5 // =0x5 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s @@ -212,9 +212,9 @@ define <1 x i64> @insertelement_v1i64(<1 x i64> %op1) { define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) { ; CHECK-LABEL: insertelement_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -229,9 +229,9 @@ define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) { define <4 x i64> @insertelement_v4i64(ptr %a) { ; CHECK-LABEL: insertelement_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov w8, #5 // =0x5 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d @@ -264,9 +264,9 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) { define <4 x half> @insertelement_v4f16(<4 x half> %op1) { ; CHECK-LABEL: insertelement_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h @@ -281,9 +281,9 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) { define <8 x half> @insertelement_v8f16(<8 x half> %op1) { ; CHECK-LABEL: insertelement_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h @@ -298,9 +298,9 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) { define <16 x half> @insertelement_v16f16(ptr %a) { ; CHECK-LABEL: insertelement_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z0.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: fmov h2, #5.00000000 ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h @@ -317,9 +317,9 @@ define <16 x half> @insertelement_v16f16(ptr %a) { define <2 x float> @insertelement_v2f32(<2 x float> %op1) { ; CHECK-LABEL: insertelement_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s @@ -334,9 +334,9 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) { define <4 x float> @insertelement_v4f32(<4 x float> %op1) { ; CHECK-LABEL: insertelement_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s @@ -351,9 +351,9 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) { define <8 x float> @insertelement_v8f32(ptr %a) { ; CHECK-LABEL: insertelement_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: fmov s2, #5.00000000 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s @@ -379,9 +379,9 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) { define <2 x double> @insertelement_v2f64(<2 x double> %op1) { ; CHECK-LABEL: insertelement_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d @@ -396,9 +396,9 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) { define <4 x double> @insertelement_v4f64(ptr %a) { ; CHECK-LABEL: insertelement_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: fmov d2, #5.00000000 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll index e3c4b6f1cb53f..8baa87c6d686d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll @@ -262,8 +262,8 @@ define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @mul_v32i8(ptr %a, ptr %b) { ; SVE-LABEL: mul_v32i8: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: mul z0.b, p0/m, z0.b, z1.b ; SVE-NEXT: movprfx z1, z2 @@ -352,8 +352,8 @@ define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @mul_v16i16(ptr %a, ptr %b) { ; SVE-LABEL: mul_v16i16: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: mul z0.h, p0/m, z0.h, z1.h ; SVE-NEXT: movprfx z1, z2 @@ -421,8 +421,8 @@ define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @mul_v8i32(ptr %a, ptr %b) { ; SVE-LABEL: mul_v8i32: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: mul z0.s, p0/m, z0.s, z1.s ; SVE-NEXT: movprfx z1, z2 @@ -490,8 +490,8 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @mul_v4i64(ptr %a, ptr %b) { ; SVE-LABEL: mul_v4i64: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: mul z0.d, p0/m, z0.d, z1.d ; SVE-NEXT: movprfx z1, z2 @@ -746,8 +746,8 @@ define <16 x i8> @abs_v16i8(<16 x i8> %op1) { define void @abs_v32i8(ptr %a) { ; CHECK-LABEL: abs_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: abs z0.b, p0/m, z0.b ; CHECK-NEXT: abs z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] @@ -798,8 +798,8 @@ define <8 x i16> @abs_v8i16(<8 x i16> %op1) { define void @abs_v16i16(ptr %a) { ; CHECK-LABEL: abs_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: abs z0.h, p0/m, z0.h ; CHECK-NEXT: abs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -837,8 +837,8 @@ define <4 x i32> @abs_v4i32(<4 x i32> %op1) { define void @abs_v8i32(ptr %a) { ; CHECK-LABEL: abs_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: abs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -876,8 +876,8 @@ define <2 x i64> @abs_v2i64(<2 x i64> %op1) { define void @abs_v4i64(ptr %a) { ; CHECK-LABEL: abs_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: abs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll index 6200e44218a96..73c1eac99dd30 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll @@ -41,8 +41,8 @@ define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @icmp_eq_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_eq_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.b, p0/z, z1.b, z0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b @@ -91,8 +91,8 @@ define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @icmp_eq_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_eq_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h @@ -141,8 +141,8 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @icmp_eq_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_eq_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s @@ -191,8 +191,8 @@ define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @icmp_eq_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_eq_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z3.d @@ -215,8 +215,8 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) { define void @icmp_ne_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_ne_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpne p1.b, p0/z, z1.b, z0.b ; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, z3.b @@ -261,8 +261,8 @@ define void @icmp_sge_v8i16(ptr %a, ptr %b) { define void @icmp_sgt_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_sgt_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpgt p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, z3.h @@ -307,8 +307,8 @@ define void @icmp_sle_v4i32(ptr %a, ptr %b) { define void @icmp_slt_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_slt_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpgt p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: cmpgt p0.s, p0/z, z3.s, z2.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll index fcf4f21c6ea84..5158dda37a8b9 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -86,8 +86,8 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h ; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b @@ -163,18 +163,18 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: sdiv z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h ; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b ; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b ; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret @@ -203,9 +203,9 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-LABEL: sdiv_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s @@ -272,8 +272,8 @@ define void @sdiv_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z3.h, p0, z3.h, z1.h ; CHECK-NEXT: stp q3, q0, [x0] @@ -314,8 +314,8 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @sdiv_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: sdiv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -358,8 +358,8 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @sdiv_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: sdiv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: sdivr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -453,8 +453,8 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h ; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b @@ -530,18 +530,18 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: udiv z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h ; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b ; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b ; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret @@ -555,9 +555,9 @@ define void @udiv_v32i8(ptr %a, ptr %b) { define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-LABEL: udiv_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z1.s, z1.s, #0xffff ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s @@ -570,9 +570,9 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-LABEL: udiv_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s @@ -639,8 +639,8 @@ define void @udiv_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z3.h, p0, z3.h, z1.h ; CHECK-NEXT: stp q3, q0, [x0] @@ -681,8 +681,8 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @udiv_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: udiv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -725,8 +725,8 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @udiv_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: udiv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: udivr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -743,10 +743,10 @@ define void @udiv_v4i64(ptr %a, ptr %b) { define void @udiv_constantsplat_v8i32(ptr %a) { ; SVE-LABEL: udiv_constantsplat_v8i32: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: mov w8, #8969 // =0x2309 -; SVE-NEXT: movk w8, #22765, lsl #16 ; SVE-NEXT: ldp q1, q2, [x0] +; SVE-NEXT: movk w8, #22765, lsl #16 +; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: mov z0.s, w8 ; SVE-NEXT: movprfx z3, z1 ; SVE-NEXT: umulh z3.s, p0/m, z3.s, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll index 0785c67ce6f41..f028b3eeca257 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll @@ -221,8 +221,8 @@ define void @ashr_v4i64(ptr %a) { define void @icmp_eq_v32i8(ptr %a) { ; CHECK-LABEL: icmp_eq_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, #7 ; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, #7 ; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff @@ -241,8 +241,8 @@ define void @icmp_eq_v32i8(ptr %a) { define void @icmp_sge_v16i16(ptr %a) { ; CHECK-LABEL: icmp_sge_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: cmpge p1.h, p0/z, z0.h, #15 ; CHECK-NEXT: cmpge p0.h, p0/z, z1.h, #15 ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff @@ -261,8 +261,8 @@ define void @icmp_sge_v16i16(ptr %a) { define void @icmp_sgt_v8i32(ptr %a) { ; CHECK-LABEL: icmp_sgt_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: cmpgt p1.s, p0/z, z0.s, #-8 ; CHECK-NEXT: cmpgt p0.s, p0/z, z1.s, #-8 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff @@ -281,8 +281,8 @@ define void @icmp_sgt_v8i32(ptr %a) { define void @icmp_ult_v4i64(ptr %a) { ; CHECK-LABEL: icmp_ult_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: cmplo p1.d, p0/z, z0.d, #63 ; CHECK-NEXT: cmplo p0.d, p0/z, z1.d, #63 ; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll index d7600c6e6192d..50cf9b73d9a79 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll @@ -37,8 +37,8 @@ define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @smax_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: smax_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -81,8 +81,8 @@ define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @smax_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: smax_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -125,8 +125,8 @@ define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @smax_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: smax_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -171,8 +171,8 @@ define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @smax_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: smax_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -219,8 +219,8 @@ define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @smin_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: smin_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -263,8 +263,8 @@ define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @smin_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: smin_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -307,8 +307,8 @@ define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @smin_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: smin_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -353,8 +353,8 @@ define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @smin_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: smin_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -401,8 +401,8 @@ define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @umax_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: umax_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -445,8 +445,8 @@ define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @umax_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: umax_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -489,8 +489,8 @@ define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @umax_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: umax_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -535,8 +535,8 @@ define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @umax_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: umax_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -583,8 +583,8 @@ define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @umin_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: umin_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -627,8 +627,8 @@ define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @umin_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: umin_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -671,8 +671,8 @@ define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @umin_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: umin_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -717,8 +717,8 @@ define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @umin_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: umin_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll index c48cb315a7aa3..cb7fa53eac513 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -101,8 +101,8 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @smulh_v32i8(ptr %a, ptr %b) { ; SVE-LABEL: smulh_v32i8: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: smulh z0.b, p0/m, z0.b, z1.b ; SVE-NEXT: movprfx z1, z2 @@ -214,8 +214,8 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @smulh_v16i16(ptr %a, ptr %b) { ; SVE-LABEL: smulh_v16i16: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: smulh z0.h, p0/m, z0.h, z1.h ; SVE-NEXT: movprfx z1, z2 @@ -295,8 +295,8 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @smulh_v8i32(ptr %a, ptr %b) { ; SVE-LABEL: smulh_v8i32: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: smulh z0.s, p0/m, z0.s, z1.s ; SVE-NEXT: movprfx z1, z2 @@ -378,8 +378,8 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @smulh_v4i64(ptr %a, ptr %b) { ; SVE-LABEL: smulh_v4i64: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: smulh z0.d, p0/m, z0.d, z1.d ; SVE-NEXT: movprfx z1, z2 @@ -413,9 +413,9 @@ define void @smulh_v4i64(ptr %a, ptr %b) { define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; SVE-LABEL: umulh_v4i8: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.h, vl4 ; SVE-NEXT: // kill: def $d1 killed $d1 def $z1 ; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: ptrue p0.h, vl4 ; SVE-NEXT: and z0.h, z0.h, #0xff ; SVE-NEXT: and z1.h, z1.h, #0xff ; SVE-NEXT: mul z0.h, p0/m, z0.h, z1.h @@ -494,8 +494,8 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @umulh_v32i8(ptr %a, ptr %b) { ; SVE-LABEL: umulh_v32i8: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: umulh z0.b, p0/m, z0.b, z1.b ; SVE-NEXT: movprfx z1, z2 @@ -525,9 +525,9 @@ define void @umulh_v32i8(ptr %a, ptr %b) { define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; SVE-LABEL: umulh_v2i16: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s, vl2 ; SVE-NEXT: // kill: def $d1 killed $d1 def $z1 ; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: ptrue p0.s, vl2 ; SVE-NEXT: and z0.s, z0.s, #0xffff ; SVE-NEXT: and z1.s, z1.s, #0xffff ; SVE-NEXT: mul z0.s, p0/m, z0.s, z1.s @@ -606,8 +606,8 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @umulh_v16i16(ptr %a, ptr %b) { ; SVE-LABEL: umulh_v16i16: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: umulh z0.h, p0/m, z0.h, z1.h ; SVE-NEXT: movprfx z1, z2 @@ -687,8 +687,8 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @umulh_v8i32(ptr %a, ptr %b) { ; SVE-LABEL: umulh_v8i32: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: umulh z0.s, p0/m, z0.s, z1.s ; SVE-NEXT: movprfx z1, z2 @@ -770,8 +770,8 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @umulh_v4i64(ptr %a, ptr %b) { ; SVE-LABEL: umulh_v4i64: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: umulh z0.d, p0/m, z0.d, z1.d ; SVE-NEXT: movprfx z1, z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll index c51630ecd752a..751f43768a511 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll @@ -37,8 +37,8 @@ define i8 @uaddv_v16i8(<16 x i8> %a) { define i8 @uaddv_v32i8(ptr %a) { ; CHECK-LABEL: uaddv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: add z0.b, z1.b, z0.b ; CHECK-NEXT: uaddv d0, p0, z0.b ; CHECK-NEXT: fmov x0, d0 @@ -78,8 +78,8 @@ define i16 @uaddv_v8i16(<8 x i16> %a) { define i16 @uaddv_v16i16(ptr %a) { ; CHECK-LABEL: uaddv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: uaddv d0, p0, z0.h ; CHECK-NEXT: fmov x0, d0 @@ -119,8 +119,8 @@ define i32 @uaddv_v4i32(<4 x i32> %a) { define i32 @uaddv_v8i32(ptr %a) { ; CHECK-LABEL: uaddv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: add z0.s, z1.s, z0.s ; CHECK-NEXT: uaddv d0, p0, z0.s ; CHECK-NEXT: fmov x0, d0 @@ -146,8 +146,8 @@ define i64 @uaddv_v2i64(<2 x i64> %a) { define i64 @uaddv_v4i64(ptr %a) { ; CHECK-LABEL: uaddv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: uaddv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -188,8 +188,8 @@ define i8 @smaxv_v16i8(<16 x i8> %a) { define i8 @smaxv_v32i8(ptr %a) { ; CHECK-LABEL: smaxv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: smaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -226,8 +226,8 @@ define i16 @smaxv_v8i16(<8 x i16> %a) { define i16 @smaxv_v16i16(ptr %a) { ; CHECK-LABEL: smaxv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: smaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -264,8 +264,8 @@ define i32 @smaxv_v4i32(<4 x i32> %a) { define i32 @smaxv_v8i32(ptr %a) { ; CHECK-LABEL: smaxv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: smaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -291,8 +291,8 @@ define i64 @smaxv_v2i64(<2 x i64> %a) { define i64 @smaxv_v4i64(ptr %a) { ; CHECK-LABEL: smaxv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: smaxv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -333,8 +333,8 @@ define i8 @sminv_v16i8(<16 x i8> %a) { define i8 @sminv_v32i8(ptr %a) { ; CHECK-LABEL: sminv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: sminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -371,8 +371,8 @@ define i16 @sminv_v8i16(<8 x i16> %a) { define i16 @sminv_v16i16(ptr %a) { ; CHECK-LABEL: sminv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: sminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -409,8 +409,8 @@ define i32 @sminv_v4i32(<4 x i32> %a) { define i32 @sminv_v8i32(ptr %a) { ; CHECK-LABEL: sminv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: sminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -436,8 +436,8 @@ define i64 @sminv_v2i64(<2 x i64> %a) { define i64 @sminv_v4i64(ptr %a) { ; CHECK-LABEL: sminv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: sminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -478,8 +478,8 @@ define i8 @umaxv_v16i8(<16 x i8> %a) { define i8 @umaxv_v32i8(ptr %a) { ; CHECK-LABEL: umaxv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -516,8 +516,8 @@ define i16 @umaxv_v8i16(<8 x i16> %a) { define i16 @umaxv_v16i16(ptr %a) { ; CHECK-LABEL: umaxv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -554,8 +554,8 @@ define i32 @umaxv_v4i32(<4 x i32> %a) { define i32 @umaxv_v8i32(ptr %a) { ; CHECK-LABEL: umaxv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -581,8 +581,8 @@ define i64 @umaxv_v2i64(<2 x i64> %a) { define i64 @umaxv_v4i64(ptr %a) { ; CHECK-LABEL: umaxv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: umaxv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -623,8 +623,8 @@ define i8 @uminv_v16i8(<16 x i8> %a) { define i8 @uminv_v32i8(ptr %a) { ; CHECK-LABEL: uminv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -661,8 +661,8 @@ define i16 @uminv_v8i16(<8 x i16> %a) { define i16 @uminv_v16i16(ptr %a) { ; CHECK-LABEL: uminv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: uminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -699,8 +699,8 @@ define i32 @uminv_v4i32(<4 x i32> %a) { define i32 @uminv_v8i32(ptr %a) { ; CHECK-LABEL: uminv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: uminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -726,8 +726,8 @@ define i64 @uminv_v2i64(<2 x i64> %a) { define i64 @uminv_v4i64(ptr %a) { ; CHECK-LABEL: uminv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: uminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index 4a1209b942f4a..d373a9063f852 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -65,7 +65,6 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z2.h, z2.b @@ -91,15 +90,16 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z5.h, p0, z5.h, z3.h ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b -; CHECK-NEXT: mls z0.b, p1/m, z3.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mls z0.b, p0/m, z3.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = srem <16 x i8> %op1, %op2 @@ -112,7 +112,6 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: sunpklo z7.h, z1.b @@ -171,22 +170,23 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h ; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z20.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h ; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h ; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h +; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h ; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h ; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b ; CHECK-NEXT: splice z6.b, p0, z6.b, z2.b +; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b ; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b -; CHECK-NEXT: mls z0.b, p1/m, z6.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mls z0.b, p0/m, z6.b, z1.b ; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: mls z2.b, p1/m, z7.b, z4.b +; CHECK-NEXT: mls z2.b, p0/m, z7.b, z4.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a @@ -199,11 +199,11 @@ define void @srem_v32i8(ptr %a, ptr %b) { define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-LABEL: srem_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h @@ -223,7 +223,6 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z4.s, z0.h -; CHECK-NEXT: ptrue p1.h, vl8 ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z2.s, z2.h @@ -235,7 +234,8 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h -; CHECK-NEXT: mls z0.h, p1/m, z3.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = srem <8 x i16> %op1, %op2 @@ -248,7 +248,6 @@ define void @srem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q4, q1, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ptrue p1.h, vl8 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: mov z5.d, z4.d @@ -277,9 +276,10 @@ define void @srem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z6.h, p0, z6.h, z5.h ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z2.h +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: mls z2.h, p1/m, z6.h, z4.h -; CHECK-NEXT: mls z0.h, p1/m, z7.h, z1.h +; CHECK-NEXT: mls z2.h, p0/m, z6.h, z4.h +; CHECK-NEXT: mls z0.h, p0/m, z7.h, z1.h ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a @@ -322,8 +322,8 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @srem_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: srem_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z0.s @@ -374,8 +374,8 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @srem_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: srem_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: sdiv z4.d, p0/m, z4.d, z0.d @@ -454,7 +454,6 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z2.h, z2.b @@ -480,15 +479,16 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z5.h, p0, z5.h, z3.h ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b -; CHECK-NEXT: mls z0.b, p1/m, z3.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mls z0.b, p0/m, z3.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = urem <16 x i8> %op1, %op2 @@ -501,7 +501,6 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: uunpklo z7.h, z1.b @@ -560,22 +559,23 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h ; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z20.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h ; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h ; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h +; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h ; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h ; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b ; CHECK-NEXT: splice z6.b, p0, z6.b, z2.b +; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b ; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b -; CHECK-NEXT: mls z0.b, p1/m, z6.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mls z0.b, p0/m, z6.b, z1.b ; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: mls z2.b, p1/m, z7.b, z4.b +; CHECK-NEXT: mls z2.b, p0/m, z7.b, z4.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a @@ -588,11 +588,11 @@ define void @urem_v32i8(ptr %a, ptr %b) { define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-LABEL: urem_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h @@ -612,7 +612,6 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z4.s, z0.h -; CHECK-NEXT: ptrue p1.h, vl8 ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z2.s, z2.h @@ -624,7 +623,8 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h -; CHECK-NEXT: mls z0.h, p1/m, z3.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = urem <8 x i16> %op1, %op2 @@ -637,7 +637,6 @@ define void @urem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q4, q1, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ptrue p1.h, vl8 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: mov z5.d, z4.d @@ -666,9 +665,10 @@ define void @urem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z6.h, p0, z6.h, z5.h ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z2.h +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: mls z2.h, p1/m, z6.h, z4.h -; CHECK-NEXT: mls z0.h, p1/m, z7.h, z1.h +; CHECK-NEXT: mls z2.h, p0/m, z6.h, z4.h +; CHECK-NEXT: mls z0.h, p0/m, z7.h, z1.h ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a @@ -711,8 +711,8 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @urem_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: urem_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z0.s @@ -763,8 +763,8 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @urem_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: urem_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: udiv z4.d, p0/m, z4.d, z0.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll index 3b58e35bd844c..906112f7ac39e 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll @@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) { ; CHECK-LABEL: select_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -23,8 +23,8 @@ define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) { define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) { ; CHECK-LABEL: select_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0 @@ -38,8 +38,8 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) { define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) { ; CHECK-LABEL: select_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0 @@ -53,8 +53,8 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) { define void @select_v32i8(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z0.b, w2 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x0, #16] @@ -74,8 +74,8 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) { define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) { ; CHECK-LABEL: select_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -90,8 +90,8 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) { define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) { ; CHECK-LABEL: select_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -106,8 +106,8 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) { define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) { ; CHECK-LABEL: select_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -122,8 +122,8 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) { define void @select_v16i16(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z0.h, w2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z0.h, z0.h, #0x1 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: ldr q0, [x0] @@ -144,8 +144,8 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) { define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) { ; CHECK-LABEL: select_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -160,8 +160,8 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) { define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) { ; CHECK-LABEL: select_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -176,8 +176,8 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) { define void @select_v8i32(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w2, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ldr q0, [x0] @@ -198,9 +198,9 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) { define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) { ; CHECK-LABEL: select_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.d, x8 @@ -215,9 +215,9 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) { define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) { ; CHECK-LABEL: select_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.d, x8 @@ -232,9 +232,9 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) { define void @select_v4i64(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ldr q0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll index c7fa0e8ad5e4a..9ed52e321d9ab 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll @@ -52,8 +52,8 @@ define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @ashr_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: ashr_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: asrr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -111,8 +111,8 @@ define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @ashr_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: ashr_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: asrr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -155,8 +155,8 @@ define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @ashr_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: ashr_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -199,8 +199,8 @@ define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @ashr_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: ashr_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: asrr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -221,9 +221,9 @@ define void @ashr_v4i64(ptr %a, ptr %b) { define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-LABEL: lshr_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h @@ -262,8 +262,8 @@ define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @lshr_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: lshr_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lsrr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -280,9 +280,9 @@ define void @lshr_v32i8(ptr %a, ptr %b) { define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-LABEL: lshr_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z1.s, z1.s, #0xffff ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s @@ -321,8 +321,8 @@ define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @lshr_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: lshr_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lsrr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -365,8 +365,8 @@ define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @lshr_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: lshr_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lsrr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -409,8 +409,8 @@ define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @lshr_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: lshr_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lsrr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -431,8 +431,8 @@ define void @lshr_v4i64(ptr %a, ptr %b) { define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) { ; CHECK-LABEL: shl_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: and z1.s, z1.s, #0xff ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s @@ -445,8 +445,8 @@ define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) { define <4 x i8> @shl_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-LABEL: shl_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h @@ -485,8 +485,8 @@ define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @shl_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: shl_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lslr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -529,8 +529,8 @@ define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @shl_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: shl_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lslr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -573,8 +573,8 @@ define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @shl_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: shl_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lslr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -617,8 +617,8 @@ define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @shl_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: shl_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lslr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 5c5cf68135bf8..b285659258f31 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -36,8 +36,8 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) { define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v16i16_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h ; CHECK-NEXT: ucvtf z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] @@ -55,8 +55,8 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) { define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) { ; CHECK-LABEL: ucvtf_v2i16_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -68,8 +68,8 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) { define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) { ; CHECK-LABEL: ucvtf_v4i16_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -277,8 +277,8 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) { define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) { ; CHECK-LABEL: ucvtf_v8i32_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ucvtf z1.h, p0/m, z1.s ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ptrue p0.h, vl4 @@ -295,8 +295,8 @@ define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) { define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v16i32_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: ucvtf z1.h, p0/m, z1.s ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s @@ -348,8 +348,8 @@ define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) { define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v8i32_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] @@ -367,8 +367,8 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) { define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) { ; CHECK-LABEL: ucvtf_v2i32_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -446,16 +446,16 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) { define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) { ; CHECK-LABEL: ucvtf_v4i64_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: fcvt z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -467,10 +467,9 @@ define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) { define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) { ; CHECK-LABEL: ucvtf_v8i64_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d ; CHECK-NEXT: ucvtf z3.s, p0/m, z3.d @@ -482,11 +481,12 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) { ; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movprfx z0, z1 -; CHECK-NEXT: fcvt z0.h, p1/m, z1.s +; CHECK-NEXT: fcvt z0.h, p0/m, z1.s ; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: fcvt z1.h, p1/m, z2.s +; CHECK-NEXT: fcvt z1.h, p0/m, z2.s +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.h, z1.h, z1.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z2.h @@ -517,8 +517,8 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) { define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) { ; CHECK-LABEL: ucvtf_v4i64_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 @@ -535,8 +535,8 @@ define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) { define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v8i64_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d @@ -576,8 +576,8 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) { define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v4i64_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] @@ -621,8 +621,8 @@ define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) { define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: scvtf_v16i16_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.h ; CHECK-NEXT: scvtf z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] @@ -652,8 +652,8 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) { define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) { ; CHECK-LABEL: scvtf_v4i16_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -850,8 +850,8 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) { define <8 x half> @scvtf_v8i32_v8f16(ptr %a) { ; CHECK-LABEL: scvtf_v8i32_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: scvtf z1.h, p0/m, z1.s ; CHECK-NEXT: scvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ptrue p0.h, vl4 @@ -896,8 +896,8 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) { define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: scvtf_v8i32_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: scvtf z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] @@ -915,8 +915,8 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) { define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) { ; CHECK-LABEL: scvtf_v2i32_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -1038,16 +1038,16 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) { define <4 x half> @scvtf_v4i64_v4f16(ptr %a) { ; CHECK-LABEL: scvtf_v4i64_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: scvtf z1.s, p0/m, z1.d ; CHECK-NEXT: scvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: fcvt z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -1076,8 +1076,8 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) { define <4 x float> @scvtf_v4i64_v4f32(ptr %a) { ; CHECK-LABEL: scvtf_v4i64_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: scvtf z1.s, p0/m, z1.d ; CHECK-NEXT: scvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 @@ -1110,8 +1110,8 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) { define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: scvtf_v4i64_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: scvtf z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll index 1809cfcf3db69..81bbaa92d4b47 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -61,8 +61,8 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) define void @select_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b @@ -136,8 +136,8 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) { define void @select_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h @@ -193,8 +193,8 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) { define void @select_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s @@ -213,9 +213,9 @@ define void @select_v8i32(ptr %a, ptr %b) { define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) { ; CHECK-LABEL: select_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.d, x8 @@ -249,8 +249,8 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) { define void @select_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z3.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll index bb1bd8fe72b21..c4aeb4465c537 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll @@ -48,8 +48,8 @@ define i8 @andv_v16i8(<16 x i8> %a) { define i8 @andv_v32i8(ptr %a) { ; CHECK-LABEL: andv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: andv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -98,8 +98,8 @@ define i16 @andv_v8i16(<8 x i16> %a) { define i16 @andv_v16i16(ptr %a) { ; CHECK-LABEL: andv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: andv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -136,8 +136,8 @@ define i32 @andv_v4i32(<4 x i32> %a) { define i32 @andv_v8i32(ptr %a) { ; CHECK-LABEL: andv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -162,8 +162,8 @@ define i64 @andv_v2i64(<2 x i64> %a) { define i64 @andv_v4i64(ptr %a) { ; CHECK-LABEL: andv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: andv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -216,8 +216,8 @@ define i8 @eorv_v16i8(<16 x i8> %a) { define i8 @eorv_v32i8(ptr %a) { ; CHECK-LABEL: eorv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: eor z0.d, z1.d, z0.d ; CHECK-NEXT: eorv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -266,8 +266,8 @@ define i16 @eorv_v8i16(<8 x i16> %a) { define i16 @eorv_v16i16(ptr %a) { ; CHECK-LABEL: eorv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: eor z0.d, z1.d, z0.d ; CHECK-NEXT: eorv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -304,8 +304,8 @@ define i32 @eorv_v4i32(<4 x i32> %a) { define i32 @eorv_v8i32(ptr %a) { ; CHECK-LABEL: eorv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: eor z0.d, z1.d, z0.d ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -330,8 +330,8 @@ define i64 @eorv_v2i64(<2 x i64> %a) { define i64 @eorv_v4i64(ptr %a) { ; CHECK-LABEL: eorv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: eor z0.d, z1.d, z0.d ; CHECK-NEXT: eorv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -384,8 +384,8 @@ define i8 @orv_v16i8(<16 x i8> %a) { define i8 @orv_v32i8(ptr %a) { ; CHECK-LABEL: orv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: orv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -434,8 +434,8 @@ define i16 @orv_v8i16(<8 x i16> %a) { define i16 @orv_v16i16(ptr %a) { ; CHECK-LABEL: orv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: orv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -472,8 +472,8 @@ define i32 @orv_v4i32(<4 x i32> %a) { define i32 @orv_v8i32(ptr %a) { ; CHECK-LABEL: orv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: orv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -498,8 +498,8 @@ define i64 @orv_v2i64(<2 x i64> %a) { define i64 @orv_v4i64(ptr %a) { ; CHECK-LABEL: orv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: orv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll index e812706744745..f2b3f9b12ea71 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll @@ -123,8 +123,8 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) { ; CHECK-NEXT: asr z0.b, z0.b, #7 ; CHECK-NEXT: asr z1.b, z1.b, #7 ; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, #0 -; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: st1b { z0.b }, p1, [x0, x8] ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: add sp, sp, #32 @@ -204,8 +204,8 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) { ; CHECK-NEXT: asr z0.h, z0.h, #15 ; CHECK-NEXT: asr z1.h, z1.h, #15 ; CHECK-NEXT: cmpne p1.h, p0/z, z1.h, #0 -; CHECK-NEXT: mov z1.h, #0 // =0x0 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: mov z1.h, #0 // =0x0 ; CHECK-NEXT: st1h { z1.h }, p1, [x0, x8, lsl #1] ; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret @@ -242,7 +242,7 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.s, #0 // =0x0 +; CHECK-NEXT: mov z2.b, z0.b[3] ; CHECK-NEXT: strh w8, [sp, #14] ; CHECK-NEXT: fmov w8, s3 ; CHECK-NEXT: mov z3.b, z0.b[2] @@ -258,9 +258,9 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; CHECK-NEXT: lsl z1.s, z1.s, #31 ; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 -; CHECK-NEXT: mov z1.b, z0.b[3] -; CHECK-NEXT: st1w { z2.s }, p1, [x0, x8, lsl #2] -; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z1.s, #0 // =0x0 +; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] +; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strh w9, [sp] ; CHECK-NEXT: strh w8, [sp, #6] ; CHECK-NEXT: fmov w8, s3 @@ -272,7 +272,7 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; CHECK-NEXT: lsl z0.s, z0.s, #31 ; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: st1w { z2.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret call void @llvm.masked.store.v8f32(<8 x float> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) @@ -310,8 +310,8 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) { ; CHECK-NEXT: asr z1.d, z1.d, #63 ; CHECK-NEXT: asr z0.d, z0.d, #63 ; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll index f0b0b3269e98f..6fcb95f283338 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll @@ -170,8 +170,8 @@ define void @abs_v4i32(ptr %a) { define void @abs_v8i32(ptr %a) { ; CHECK-LABEL: abs_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: abs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -199,8 +199,8 @@ define void @abs_v2i64(ptr %a) { define void @abs_v4i64(ptr %a) { ; CHECK-LABEL: abs_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: abs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -263,8 +263,8 @@ define void @fadd_v8f16(ptr %a, ptr %b) { define void @fadd_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -313,8 +313,8 @@ define void @fadd_v4f32(ptr %a, ptr %b) { define void @fadd_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -347,8 +347,8 @@ define void @fadd_v2f64(ptr %a, ptr %b) { define void @fadd_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll index d1bff4fa21a11..00413302798ca 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll @@ -9,8 +9,8 @@ target triple = "aarch64-unknown-linux-gnu" define void @test_revbv16i16(ptr %a) { ; CHECK-LABEL: test_revbv16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: revb z0.h, p0/m, z0.h ; CHECK-NEXT: revb z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -25,8 +25,8 @@ define void @test_revbv16i16(ptr %a) { define void @test_revbv8i32(ptr %a) { ; CHECK-LABEL: test_revbv8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: revb z0.s, p0/m, z0.s ; CHECK-NEXT: revb z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -41,8 +41,8 @@ define void @test_revbv8i32(ptr %a) { define void @test_revbv4i64(ptr %a) { ; CHECK-LABEL: test_revbv4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: revb z0.d, p0/m, z0.d ; CHECK-NEXT: revb z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -57,8 +57,8 @@ define void @test_revbv4i64(ptr %a) { define void @test_revhv8i32(ptr %a) { ; CHECK-LABEL: test_revhv8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: revh z0.s, p0/m, z0.s ; CHECK-NEXT: revh z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -73,8 +73,8 @@ define void @test_revhv8i32(ptr %a) { define void @test_revhv8f32(ptr %a) { ; CHECK-LABEL: test_revhv8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: revh z0.s, p0/m, z0.s ; CHECK-NEXT: revh z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -89,8 +89,8 @@ define void @test_revhv8f32(ptr %a) { define void @test_revhv4i64(ptr %a) { ; CHECK-LABEL: test_revhv4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: revh z0.d, p0/m, z0.d ; CHECK-NEXT: revh z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -105,8 +105,8 @@ define void @test_revhv4i64(ptr %a) { define void @test_revwv4i64(ptr %a) { ; CHECK-LABEL: test_revwv4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: revw z0.d, p0/m, z0.d ; CHECK-NEXT: revw z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -121,8 +121,8 @@ define void @test_revwv4i64(ptr %a) { define void @test_revwv4f64(ptr %a) { ; CHECK-LABEL: test_revwv4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: revw z0.d, p0/m, z0.d ; CHECK-NEXT: revw z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -150,8 +150,8 @@ define <16 x i8> @test_revv16i8(ptr %a) { define void @test_revwv8i32v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: test_revwv8i32v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: revw z0.d, p0/m, z0.d ; CHECK-NEXT: revw z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -166,8 +166,8 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) { define void @test_revhv32i16(ptr %a) { ; CHECK-LABEL: test_revhv32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: revh z0.d, p0/m, z0.d ; CHECK-NEXT: revh z1.d, p0/m, z1.d @@ -202,8 +202,8 @@ define void @test_rev_elts_fail(ptr %a) { define void @test_revdv4i64_sve2p1(ptr %a) #1 { ; CHECK-LABEL: test_revdv4i64_sve2p1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: revd z0.q, p0/m, z0.q ; CHECK-NEXT: revd z1.q, p0/m, z1.q ; CHECK-NEXT: stp q0, q1, [x0] @@ -217,8 +217,8 @@ define void @test_revdv4i64_sve2p1(ptr %a) #1 { define void @test_revdv4f64_sve2p1(ptr %a) #1 { ; CHECK-LABEL: test_revdv4f64_sve2p1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: revd z0.q, p0/m, z0.q ; CHECK-NEXT: revd z1.q, p0/m, z1.q ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll index d7bfb6b2680e1..cb73030306b02 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -82,11 +82,11 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: .cfi_def_cfa_offset 64 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: ldp q0, q4, [x0] -; CHECK-NEXT: ldp q2, q6, [x0, #32] +; CHECK-NEXT: ldp q2, q5, [x0, #32] ; CHECK-NEXT: mov z16.h, z3.h[7] ; CHECK-NEXT: mov z18.h, z3.h[6] ; CHECK-NEXT: mov z17.h, z4.h[7] -; CHECK-NEXT: ldp q5, q7, [x1, #32] +; CHECK-NEXT: ldp q6, q7, [x1, #32] ; CHECK-NEXT: mov z19.h, z4.h[6] ; CHECK-NEXT: fmov w8, s16 ; CHECK-NEXT: mov z16.h, z3.h[5] @@ -98,13 +98,13 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z18.h, z3.h[4] ; CHECK-NEXT: strh w9, [sp, #28] ; CHECK-NEXT: fmov w9, s19 -; CHECK-NEXT: mov z19.h, z6.h[7] +; CHECK-NEXT: mov z19.h, z5.h[7] ; CHECK-NEXT: zip1 z3.h, z4.h, z3.h ; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s16 ; CHECK-NEXT: mov z16.h, z4.h[4] ; CHECK-NEXT: strh w9, [sp, #24] -; CHECK-NEXT: zip1 z4.h, z6.h, z7.h +; CHECK-NEXT: zip1 z4.h, z5.h, z7.h ; CHECK-NEXT: strh w8, [sp, #22] ; CHECK-NEXT: fmov w8, s17 ; CHECK-NEXT: mov z17.h, z1.h[7] @@ -131,7 +131,7 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: fmov w8, s18 ; CHECK-NEXT: mov z18.h, z0.h[4] ; CHECK-NEXT: zip1 z0.h, z0.h, z1.h -; CHECK-NEXT: zip1 z1.h, z2.h, z5.h +; CHECK-NEXT: zip1 z1.h, z2.h, z6.h ; CHECK-NEXT: strh w8, [sp, #54] ; CHECK-NEXT: fmov w8, s16 ; CHECK-NEXT: ldr q16, [sp, #16] @@ -143,41 +143,41 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z18.h, z7.h[7] ; CHECK-NEXT: strh w8, [sp, #48] ; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z6.h[6] +; CHECK-NEXT: mov z18.h, z5.h[6] ; CHECK-NEXT: ldr q17, [sp, #48] ; CHECK-NEXT: strh w8, [sp, #46] ; CHECK-NEXT: fmov w8, s19 ; CHECK-NEXT: mov z19.h, z7.h[5] ; CHECK-NEXT: strh w8, [sp, #44] ; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z6.h[5] +; CHECK-NEXT: mov z20.h, z5.h[5] ; CHECK-NEXT: strh w8, [sp, #42] ; CHECK-NEXT: fmov w8, s18 ; CHECK-NEXT: mov z18.h, z7.h[4] ; CHECK-NEXT: strh w8, [sp, #40] ; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z6.h[4] +; CHECK-NEXT: mov z19.h, z5.h[4] ; CHECK-NEXT: strh w8, [sp, #38] ; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z5.h[7] +; CHECK-NEXT: mov z20.h, z6.h[7] ; CHECK-NEXT: strh w8, [sp, #36] ; CHECK-NEXT: fmov w8, s18 ; CHECK-NEXT: mov z18.h, z2.h[7] ; CHECK-NEXT: strh w8, [sp, #34] ; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z5.h[6] +; CHECK-NEXT: mov z19.h, z6.h[6] ; CHECK-NEXT: strh w8, [sp, #32] ; CHECK-NEXT: fmov w8, s20 ; CHECK-NEXT: mov z20.h, z2.h[6] ; CHECK-NEXT: strh w8, [sp, #14] ; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z5.h[5] +; CHECK-NEXT: mov z18.h, z6.h[5] ; CHECK-NEXT: strh w8, [sp, #12] ; CHECK-NEXT: fmov w8, s19 ; CHECK-NEXT: mov z19.h, z2.h[5] ; CHECK-NEXT: strh w8, [sp, #10] ; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z5.h[4] +; CHECK-NEXT: mov z20.h, z6.h[4] ; CHECK-NEXT: fmov w9, s19 ; CHECK-NEXT: strh w8, [sp, #8] ; CHECK-NEXT: fmov w8, s18 @@ -186,10 +186,10 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q2, [sp, #32] ; CHECK-NEXT: strh w8, [sp, #6] ; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: fmov w9, s18 ; CHECK-NEXT: add z2.h, z16.h, z2.h ; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: strh w8, [sp] +; CHECK-NEXT: strh w9, [sp] ; CHECK-NEXT: ldr q4, [sp] ; CHECK-NEXT: stp q3, q2, [x0, #32] ; CHECK-NEXT: add z1.h, z17.h, z4.h @@ -471,9 +471,9 @@ define void @trn_v4f64(ptr %a, ptr %b) { define void @trn_v4f32(ptr %a, ptr %b) { ; CHECK-LABEL: trn_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: trn1 z2.s, z0.s, z1.s ; CHECK-NEXT: trn2 z0.s, z0.s, z1.s ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll index f2ba4a7cc3567..ab7c42b3e9e37 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll @@ -8,8 +8,8 @@ target triple = "aarch64-unknown-linux-gnu" define i1 @ptest_v16i1(ptr %a, ptr %b) { ; CHECK-LABEL: ptest_v16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: fcmne p2.s, p0/z, z1.s, #0.0 @@ -20,7 +20,6 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) { ; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h @@ -31,7 +30,8 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b ; CHECK-NEXT: splice z1.b, p0, z1.b, z0.b -; CHECK-NEXT: umaxv b0, p1, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: umaxv b0, p0, z1.b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -45,41 +45,41 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) { define i1 @ptest_or_v16i1(ptr %a, ptr %b) { ; CHECK-LABEL: ptest_or_v16i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: ldp q4, q5, [x1] -; CHECK-NEXT: ldp q6, q7, [x1, #32] -; CHECK-NEXT: fcmne p1.s, p0/z, z3.s, #0.0 -; CHECK-NEXT: fcmne p2.s, p0/z, z2.s, #0.0 -; CHECK-NEXT: fcmne p3.s, p0/z, z1.s, #0.0 -; CHECK-NEXT: fcmne p4.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: fcmne p5.s, p0/z, z7.s, #0.0 -; CHECK-NEXT: fcmne p6.s, p0/z, z6.s, #0.0 -; CHECK-NEXT: fcmne p7.s, p0/z, z5.s, #0.0 -; CHECK-NEXT: fcmne p0.s, p0/z, z4.s, #0.0 +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q4, q5, [x1, #32] +; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fcmne p2.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0 +; CHECK-NEXT: fcmne p4.s, p0/z, z2.s, #0.0 +; CHECK-NEXT: fcmne p5.s, p0/z, z5.s, #0.0 +; CHECK-NEXT: fcmne p6.s, p0/z, z4.s, #0.0 +; CHECK-NEXT: fcmne p7.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.s, p4/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z4.s, p5/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z5.s, p6/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: mov z6.s, p7/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z7.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p1.h, vl4 -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h +; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h +; CHECK-NEXT: splice z5.h, p0, z5.h, z4.h +; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h -; CHECK-NEXT: splice z3.h, p1, z3.h, z2.h -; CHECK-NEXT: splice z5.h, p1, z5.h, z4.h -; CHECK-NEXT: splice z7.h, p1, z7.h, z6.h ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b ; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b @@ -112,41 +112,41 @@ declare i1 @llvm.vector.reduce.or.i1.v16i1(<16 x i1>) define i1 @ptest_and_v16i1(ptr %a, ptr %b) { ; CHECK-LABEL: ptest_and_v16i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: ldp q4, q5, [x1] -; CHECK-NEXT: ldp q6, q7, [x1, #32] -; CHECK-NEXT: fcmne p1.s, p0/z, z3.s, #0.0 -; CHECK-NEXT: fcmne p2.s, p0/z, z2.s, #0.0 -; CHECK-NEXT: fcmne p3.s, p0/z, z1.s, #0.0 -; CHECK-NEXT: fcmne p4.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: fcmne p5.s, p0/z, z7.s, #0.0 -; CHECK-NEXT: fcmne p6.s, p0/z, z6.s, #0.0 -; CHECK-NEXT: fcmne p7.s, p0/z, z5.s, #0.0 -; CHECK-NEXT: fcmne p0.s, p0/z, z4.s, #0.0 +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q4, q5, [x1, #32] +; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fcmne p2.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0 +; CHECK-NEXT: fcmne p4.s, p0/z, z2.s, #0.0 +; CHECK-NEXT: fcmne p5.s, p0/z, z5.s, #0.0 +; CHECK-NEXT: fcmne p6.s, p0/z, z4.s, #0.0 +; CHECK-NEXT: fcmne p7.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.s, p4/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z4.s, p5/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z5.s, p6/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: mov z6.s, p7/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z7.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p1.h, vl4 -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h +; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h +; CHECK-NEXT: splice z5.h, p0, z5.h, z4.h +; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h -; CHECK-NEXT: splice z3.h, p1, z3.h, z2.h -; CHECK-NEXT: splice z5.h, p1, z5.h, z4.h -; CHECK-NEXT: splice z7.h, p1, z7.h, z6.h ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b ; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll index f686efff67b66..bfa931044bc53 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll @@ -49,8 +49,8 @@ define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) { define void @bitreverse_v32i8(ptr %a) { ; CHECK-LABEL: bitreverse_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: rbit z0.b, p0/m, z0.b ; CHECK-NEXT: rbit z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] @@ -101,8 +101,8 @@ define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) { define void @bitreverse_v16i16(ptr %a) { ; CHECK-LABEL: bitreverse_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: rbit z0.h, p0/m, z0.h ; CHECK-NEXT: rbit z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -140,8 +140,8 @@ define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) { define void @bitreverse_v8i32(ptr %a) { ; CHECK-LABEL: bitreverse_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: rbit z0.s, p0/m, z0.s ; CHECK-NEXT: rbit z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -179,8 +179,8 @@ define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) { define void @bitreverse_v4i64(ptr %a) { ; CHECK-LABEL: bitreverse_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: rbit z0.d, p0/m, z0.d ; CHECK-NEXT: rbit z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -235,8 +235,8 @@ define <8 x i16> @bswap_v8i16(<8 x i16> %op) { define void @bswap_v16i16(ptr %a) { ; CHECK-LABEL: bswap_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: revb z0.h, p0/m, z0.h ; CHECK-NEXT: revb z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -274,8 +274,8 @@ define <4 x i32> @bswap_v4i32(<4 x i32> %op) { define void @bswap_v8i32(ptr %a) { ; CHECK-LABEL: bswap_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: revb z0.s, p0/m, z0.s ; CHECK-NEXT: revb z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -313,8 +313,8 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) { define void @bswap_v4i64(ptr %a) { ; CHECK-LABEL: bswap_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: revb z0.d, p0/m, z0.d ; CHECK-NEXT: revb z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll index 76bb465774d5b..9dd42e7831e0d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll @@ -45,8 +45,8 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) { define void @sdiv_v32i8(ptr %a) { ; CHECK-LABEL: sdiv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5 ; CHECK-NEXT: asrd z1.b, p0/m, z1.b, #5 ; CHECK-NEXT: stp q0, q1, [x0] @@ -97,8 +97,8 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) { define void @sdiv_v16i16(ptr %a) { ; CHECK-LABEL: sdiv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5 ; CHECK-NEXT: asrd z1.h, p0/m, z1.h, #5 ; CHECK-NEXT: stp q0, q1, [x0] @@ -136,8 +136,8 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) { define void @sdiv_v8i32(ptr %a) { ; CHECK-LABEL: sdiv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5 ; CHECK-NEXT: asrd z1.s, p0/m, z1.s, #5 ; CHECK-NEXT: stp q0, q1, [x0] @@ -176,8 +176,8 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) { define void @sdiv_v4i64(ptr %a) { ; CHECK-LABEL: sdiv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5 ; CHECK-NEXT: asrd z1.d, p0/m, z1.d, #5 ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll index ff1f8699b91af..ad0d4ef0afef3 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll @@ -37,9 +37,9 @@ define void @interleave_store_without_splat(ptr %a, <4 x i32> %v1, <4 x i32> %v2 define void @interleave_store_legalization(ptr %a, <8 x i32> %v1, <8 x i32> %v2) { ; CHECK-LABEL: interleave_store_legalization: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: mov z5.d, z2.d ; CHECK-NEXT: // kill: def $q3 killed $q3 def $z2_z3 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: mov x8, #8 // =0x8 ; CHECK-NEXT: mov z4.d, z0.d ; CHECK-NEXT: mov z2.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll index 367ccbeeea81e..06709ca3685c8 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll @@ -8,8 +8,8 @@ target triple = "aarch64-unknown-linux-gnu" define void @store_v4i8(ptr %a) { ; CHECK-LABEL: store_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: st1b { z0.h }, p0, [x0] ; CHECK-NEXT: ret store <4 x i8> zeroinitializer, ptr %a @@ -49,8 +49,8 @@ define void @store_v32i8(ptr %a) { define void @store_v2i16(ptr %a) { ; CHECK-LABEL: store_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret store <2 x i16> zeroinitializer, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll index 4fef678314019..70219dd30f769 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll @@ -87,51 +87,51 @@ define void @trunc_v64i16_v64i8(ptr %in, ptr %out) nounwind { define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v128i16_v128i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #192] +; CHECK-NEXT: ldp q2, q3, [x0, #192] ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: ldp q6, q7, [x0, #224] -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z7.b, z7.b, z7.b -; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b -; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: ldp q6, q7, [x0, #64] +; CHECK-NEXT: ldp q16, q17, [x0, #224] ; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b -; CHECK-NEXT: ldp q16, q17, [x0, #64] ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: ldp q18, q19, [x0, #128] -; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: ldp q20, q21, [x0, #160] -; CHECK-NEXT: splice z6.b, p0, z6.b, z7.b -; CHECK-NEXT: ldp q22, q23, [x0, #96] -; CHECK-NEXT: uzp1 z1.b, z17.b, z17.b -; CHECK-NEXT: uzp1 z19.b, z19.b, z19.b -; CHECK-NEXT: uzp1 z18.b, z18.b, z18.b +; CHECK-NEXT: uzp1 z7.b, z7.b, z7.b +; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: uzp1 z17.b, z17.b, z17.b +; CHECK-NEXT: ldp q4, q5, [x0, #96] ; CHECK-NEXT: uzp1 z16.b, z16.b, z16.b -; CHECK-NEXT: uzp1 z21.b, z21.b, z21.b +; CHECK-NEXT: ldp q18, q19, [x0, #128] +; CHECK-NEXT: splice z2.b, p0, z2.b, z3.b +; CHECK-NEXT: uzp1 z3.b, z21.b, z21.b ; CHECK-NEXT: uzp1 z20.b, z20.b, z20.b -; CHECK-NEXT: uzp1 z5.b, z5.b, z5.b -; CHECK-NEXT: uzp1 z7.b, z23.b, z23.b -; CHECK-NEXT: uzp1 z17.b, z22.b, z22.b +; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b +; CHECK-NEXT: ldp q21, q22, [x0] +; CHECK-NEXT: splice z16.b, p0, z16.b, z17.b +; CHECK-NEXT: uzp1 z19.b, z19.b, z19.b +; CHECK-NEXT: uzp1 z18.b, z18.b, z18.b ; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z3.b -; CHECK-NEXT: add z0.b, z0.b, z0.b +; CHECK-NEXT: splice z20.b, p0, z20.b, z3.b +; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b +; CHECK-NEXT: splice z6.b, p0, z6.b, z7.b +; CHECK-NEXT: uzp1 z5.b, z22.b, z22.b +; CHECK-NEXT: uzp1 z7.b, z21.b, z21.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: splice z18.b, p0, z18.b, z19.b -; CHECK-NEXT: splice z16.b, p0, z16.b, z1.b -; CHECK-NEXT: add z1.b, z6.b, z6.b -; CHECK-NEXT: splice z20.b, p0, z20.b, z21.b -; CHECK-NEXT: splice z17.b, p0, z17.b, z7.b -; CHECK-NEXT: splice z4.b, p0, z4.b, z5.b -; CHECK-NEXT: stp q0, q1, [x1, #96] ; CHECK-NEXT: add z2.b, z2.b, z2.b +; CHECK-NEXT: splice z4.b, p0, z4.b, z3.b +; CHECK-NEXT: add z3.b, z16.b, z16.b +; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b +; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b +; CHECK-NEXT: add z1.b, z20.b, z20.b ; CHECK-NEXT: add z5.b, z18.b, z18.b -; CHECK-NEXT: add z0.b, z16.b, z16.b -; CHECK-NEXT: add z3.b, z20.b, z20.b -; CHECK-NEXT: add z1.b, z17.b, z17.b -; CHECK-NEXT: add z4.b, z4.b, z4.b -; CHECK-NEXT: stp q5, q3, [x1, #64] -; CHECK-NEXT: stp q4, q2, [x1] -; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q2, q3, [x1, #96] +; CHECK-NEXT: add z2.b, z6.b, z6.b +; CHECK-NEXT: add z3.b, z4.b, z4.b +; CHECK-NEXT: add z4.b, z7.b, z7.b +; CHECK-NEXT: add z0.b, z0.b, z0.b +; CHECK-NEXT: stp q5, q1, [x1, #64] +; CHECK-NEXT: stp q2, q3, [x1, #32] +; CHECK-NEXT: stp q4, q0, [x1] ; CHECK-NEXT: ret %a = load <128 x i16>, ptr %in %b = trunc <128 x i16> %a to <128 x i8> @@ -226,55 +226,55 @@ define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind { define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v64i32_v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #64] -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: ldp q2, q3, [x0, #160] -; CHECK-NEXT: ptrue p1.b, vl8 -; CHECK-NEXT: ldp q4, q5, [x0, #96] -; CHECK-NEXT: ldp q6, q7, [x0] -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: ldp q16, q17, [x0, #128] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ldp q4, q5, [x0, #128] +; CHECK-NEXT: ldp q0, q1, [x0, #64] +; CHECK-NEXT: ldp q6, q7, [x0, #96] ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: ldp q18, q19, [x0, #192] ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: ldp q20, q21, [x0, #224] +; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: ldp q16, q17, [x0] +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ldp q18, q19, [x0, #192] ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h +; CHECK-NEXT: ldp q20, q21, [x0, #224] +; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h ; CHECK-NEXT: ldp q22, q23, [x0, #32] -; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h -; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h -; CHECK-NEXT: uzp1 z21.h, z21.h, z21.h -; CHECK-NEXT: uzp1 z20.h, z20.h, z20.h +; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h +; CHECK-NEXT: uzp1 z3.h, z21.h, z21.h +; CHECK-NEXT: uzp1 z5.h, z20.h, z20.h +; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h +; CHECK-NEXT: uzp1 z20.h, z23.h, z23.h +; CHECK-NEXT: uzp1 z21.h, z22.h, z22.h ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z23.h, z23.h, z23.h -; CHECK-NEXT: uzp1 z22.h, z22.h, z22.h -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h ; CHECK-NEXT: splice z18.h, p0, z18.h, z19.h -; CHECK-NEXT: splice z20.h, p0, z20.h, z21.h +; CHECK-NEXT: splice z5.h, p0, z5.h, z3.h +; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h +; CHECK-NEXT: splice z21.h, p0, z21.h, z20.h ; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h -; CHECK-NEXT: splice z22.h, p0, z22.h, z23.h -; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: uzp1 z1.b, z2.b, z2.b -; CHECK-NEXT: uzp1 z2.b, z16.b, z16.b -; CHECK-NEXT: uzp1 z5.b, z18.b, z18.b -; CHECK-NEXT: uzp1 z3.b, z20.b, z20.b -; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b -; CHECK-NEXT: uzp1 z7.b, z22.b, z22.b -; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z4.b, z18.b, z18.b +; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b +; CHECK-NEXT: uzp1 z7.b, z16.b, z16.b +; CHECK-NEXT: uzp1 z5.b, z21.b, z21.b +; CHECK-NEXT: splice z2.b, p0, z2.b, z1.b +; CHECK-NEXT: uzp1 z1.b, z6.b, z6.b ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: splice z2.b, p1, z2.b, z1.b -; CHECK-NEXT: splice z5.b, p1, z5.b, z3.b -; CHECK-NEXT: splice z6.b, p1, z6.b, z7.b -; CHECK-NEXT: splice z0.b, p1, z0.b, z4.b +; CHECK-NEXT: splice z4.b, p0, z4.b, z3.b +; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b +; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: add z1.b, z2.b, z2.b -; CHECK-NEXT: add z2.b, z5.b, z5.b -; CHECK-NEXT: add z3.b, z6.b, z6.b +; CHECK-NEXT: add z2.b, z4.b, z4.b +; CHECK-NEXT: add z3.b, z7.b, z7.b ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: stp q1, q2, [x1, #32] ; CHECK-NEXT: stp q3, q0, [x1] @@ -368,51 +368,51 @@ define void @trunc_v32i32_v32i16(ptr %in, ptr %out) nounwind { define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v64i32_v64i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #192] +; CHECK-NEXT: ldp q2, q3, [x0, #192] ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: ldp q6, q7, [x0, #224] -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: ldp q6, q7, [x0, #64] +; CHECK-NEXT: ldp q16, q17, [x0, #224] ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: ldp q16, q17, [x0, #64] ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: ldp q18, q19, [x0, #128] -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: ldp q20, q21, [x0, #160] -; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h -; CHECK-NEXT: ldp q22, q23, [x0, #96] -; CHECK-NEXT: uzp1 z1.h, z17.h, z17.h -; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h -; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h +; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h +; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h +; CHECK-NEXT: ldp q4, q5, [x0, #96] ; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h -; CHECK-NEXT: uzp1 z21.h, z21.h, z21.h +; CHECK-NEXT: ldp q18, q19, [x0, #128] +; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: uzp1 z3.h, z21.h, z21.h ; CHECK-NEXT: uzp1 z20.h, z20.h, z20.h -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z7.h, z23.h, z23.h -; CHECK-NEXT: uzp1 z17.h, z22.h, z22.h +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: ldp q21, q22, [x0] +; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h +; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h +; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: add z0.h, z0.h, z0.h +; CHECK-NEXT: splice z20.h, p0, z20.h, z3.h +; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h +; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h +; CHECK-NEXT: uzp1 z5.h, z22.h, z22.h +; CHECK-NEXT: uzp1 z7.h, z21.h, z21.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: splice z18.h, p0, z18.h, z19.h -; CHECK-NEXT: splice z16.h, p0, z16.h, z1.h -; CHECK-NEXT: add z1.h, z6.h, z6.h -; CHECK-NEXT: splice z20.h, p0, z20.h, z21.h -; CHECK-NEXT: splice z17.h, p0, z17.h, z7.h -; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h -; CHECK-NEXT: stp q0, q1, [x1, #96] ; CHECK-NEXT: add z2.h, z2.h, z2.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z3.h +; CHECK-NEXT: add z3.h, z16.h, z16.h +; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h +; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h +; CHECK-NEXT: add z1.h, z20.h, z20.h ; CHECK-NEXT: add z5.h, z18.h, z18.h -; CHECK-NEXT: add z0.h, z16.h, z16.h -; CHECK-NEXT: add z3.h, z20.h, z20.h -; CHECK-NEXT: add z1.h, z17.h, z17.h -; CHECK-NEXT: add z4.h, z4.h, z4.h -; CHECK-NEXT: stp q5, q3, [x1, #64] -; CHECK-NEXT: stp q4, q2, [x1] -; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q2, q3, [x1, #96] +; CHECK-NEXT: add z2.h, z6.h, z6.h +; CHECK-NEXT: add z3.h, z4.h, z4.h +; CHECK-NEXT: add z4.h, z7.h, z7.h +; CHECK-NEXT: add z0.h, z0.h, z0.h +; CHECK-NEXT: stp q5, q1, [x1, #64] +; CHECK-NEXT: stp q2, q3, [x1, #32] +; CHECK-NEXT: stp q4, q0, [x1] ; CHECK-NEXT: ret %a = load <64 x i32>, ptr %in %b = trunc <64 x i32> %a to <64 x i16> @@ -535,19 +535,19 @@ define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s ; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s -; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s ; CHECK-NEXT: splice z20.s, p0, z20.s, z21.s -; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s +; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s ; CHECK-NEXT: splice z22.s, p0, z22.s, z23.s +; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s ; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z2.h, z16.h, z16.h -; CHECK-NEXT: uzp1 z5.h, z18.h, z18.h ; CHECK-NEXT: uzp1 z3.h, z20.h, z20.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z5.h, z18.h, z18.h ; CHECK-NEXT: uzp1 z7.h, z22.h, z22.h +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z1.h @@ -658,55 +658,55 @@ define void @trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind { define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v32i64_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #64] -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: ldp q2, q3, [x0, #160] -; CHECK-NEXT: ptrue p1.h, vl4 -; CHECK-NEXT: ldp q4, q5, [x0, #96] -; CHECK-NEXT: ldp q6, q7, [x0] -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: ldp q16, q17, [x0, #128] +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ldp q4, q5, [x0, #128] +; CHECK-NEXT: ldp q0, q1, [x0, #64] +; CHECK-NEXT: ldp q6, q7, [x0, #96] ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: ldp q18, q19, [x0, #192] ; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: ldp q20, q21, [x0, #224] +; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s +; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s +; CHECK-NEXT: ldp q16, q17, [x0] +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ldp q18, q19, [x0, #192] ; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s +; CHECK-NEXT: ldp q20, q21, [x0, #224] +; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s ; CHECK-NEXT: ldp q22, q23, [x0, #32] -; CHECK-NEXT: uzp1 z17.s, z17.s, z17.s -; CHECK-NEXT: uzp1 z16.s, z16.s, z16.s +; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s ; CHECK-NEXT: uzp1 z19.s, z19.s, z19.s ; CHECK-NEXT: uzp1 z18.s, z18.s, z18.s -; CHECK-NEXT: uzp1 z21.s, z21.s, z21.s -; CHECK-NEXT: uzp1 z20.s, z20.s, z20.s +; CHECK-NEXT: uzp1 z17.s, z17.s, z17.s +; CHECK-NEXT: uzp1 z3.s, z21.s, z21.s +; CHECK-NEXT: uzp1 z5.s, z20.s, z20.s +; CHECK-NEXT: uzp1 z16.s, z16.s, z16.s +; CHECK-NEXT: uzp1 z20.s, z23.s, z23.s +; CHECK-NEXT: uzp1 z21.s, z22.s, z22.s ; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s -; CHECK-NEXT: uzp1 z23.s, z23.s, z23.s -; CHECK-NEXT: uzp1 z22.s, z22.s, z22.s -; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s -; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s ; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s -; CHECK-NEXT: splice z20.s, p0, z20.s, z21.s +; CHECK-NEXT: splice z5.s, p0, z5.s, z3.s +; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s +; CHECK-NEXT: splice z21.s, p0, z21.s, z20.s ; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s -; CHECK-NEXT: splice z22.s, p0, z22.s, z23.s -; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z2.h, z16.h, z16.h -; CHECK-NEXT: uzp1 z5.h, z18.h, z18.h -; CHECK-NEXT: uzp1 z3.h, z20.h, z20.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z7.h, z22.h, z22.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z4.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: uzp1 z4.h, z18.h, z18.h +; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h +; CHECK-NEXT: uzp1 z5.h, z21.h, z21.h +; CHECK-NEXT: splice z2.h, p0, z2.h, z1.h +; CHECK-NEXT: uzp1 z1.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z2.h, p1, z2.h, z1.h -; CHECK-NEXT: splice z5.h, p1, z5.h, z3.h -; CHECK-NEXT: splice z6.h, p1, z6.h, z7.h -; CHECK-NEXT: splice z0.h, p1, z0.h, z4.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z3.h +; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h +; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: add z1.h, z2.h, z2.h -; CHECK-NEXT: add z2.h, z5.h, z5.h -; CHECK-NEXT: add z3.h, z6.h, z6.h +; CHECK-NEXT: add z2.h, z4.h, z4.h +; CHECK-NEXT: add z3.h, z7.h, z7.h ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: stp q1, q2, [x1, #32] ; CHECK-NEXT: stp q3, q0, [x1] @@ -800,51 +800,51 @@ define void @trunc_v16i64_v16i32(ptr %in, ptr %out) nounwind { define void @trunc_v32i64_v32i32(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v32i64_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #192] +; CHECK-NEXT: ldp q2, q3, [x0, #192] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ldp q6, q7, [x0, #224] -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s -; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s -; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: ldp q6, q7, [x0, #64] +; CHECK-NEXT: ldp q16, q17, [x0, #224] ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: ldp q16, q17, [x0, #64] ; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: ldp q18, q19, [x0, #128] -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ldp q20, q21, [x0, #160] -; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s -; CHECK-NEXT: ldp q22, q23, [x0, #96] -; CHECK-NEXT: uzp1 z1.s, z17.s, z17.s -; CHECK-NEXT: uzp1 z19.s, z19.s, z19.s -; CHECK-NEXT: uzp1 z18.s, z18.s, z18.s +; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s +; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: uzp1 z17.s, z17.s, z17.s +; CHECK-NEXT: ldp q4, q5, [x0, #96] ; CHECK-NEXT: uzp1 z16.s, z16.s, z16.s -; CHECK-NEXT: uzp1 z21.s, z21.s, z21.s +; CHECK-NEXT: ldp q18, q19, [x0, #128] +; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s +; CHECK-NEXT: uzp1 z3.s, z21.s, z21.s ; CHECK-NEXT: uzp1 z20.s, z20.s, z20.s -; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s -; CHECK-NEXT: uzp1 z7.s, z23.s, z23.s -; CHECK-NEXT: uzp1 z17.s, z22.s, z22.s +; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s +; CHECK-NEXT: ldp q21, q22, [x0] +; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s +; CHECK-NEXT: uzp1 z19.s, z19.s, z19.s +; CHECK-NEXT: uzp1 z18.s, z18.s, z18.s ; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: add z0.s, z0.s, z0.s +; CHECK-NEXT: splice z20.s, p0, z20.s, z3.s +; CHECK-NEXT: uzp1 z3.s, z5.s, z5.s +; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s +; CHECK-NEXT: uzp1 z5.s, z22.s, z22.s +; CHECK-NEXT: uzp1 z7.s, z21.s, z21.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s -; CHECK-NEXT: splice z16.s, p0, z16.s, z1.s -; CHECK-NEXT: add z1.s, z6.s, z6.s -; CHECK-NEXT: splice z20.s, p0, z20.s, z21.s -; CHECK-NEXT: splice z17.s, p0, z17.s, z7.s -; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s -; CHECK-NEXT: stp q0, q1, [x1, #96] ; CHECK-NEXT: add z2.s, z2.s, z2.s +; CHECK-NEXT: splice z4.s, p0, z4.s, z3.s +; CHECK-NEXT: add z3.s, z16.s, z16.s +; CHECK-NEXT: splice z7.s, p0, z7.s, z5.s +; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s +; CHECK-NEXT: add z1.s, z20.s, z20.s ; CHECK-NEXT: add z5.s, z18.s, z18.s -; CHECK-NEXT: add z0.s, z16.s, z16.s -; CHECK-NEXT: add z3.s, z20.s, z20.s -; CHECK-NEXT: add z1.s, z17.s, z17.s -; CHECK-NEXT: add z4.s, z4.s, z4.s -; CHECK-NEXT: stp q5, q3, [x1, #64] -; CHECK-NEXT: stp q4, q2, [x1] -; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q2, q3, [x1, #96] +; CHECK-NEXT: add z2.s, z6.s, z6.s +; CHECK-NEXT: add z3.s, z4.s, z4.s +; CHECK-NEXT: add z4.s, z7.s, z7.s +; CHECK-NEXT: add z0.s, z0.s, z0.s +; CHECK-NEXT: stp q5, q1, [x1, #64] +; CHECK-NEXT: stp q2, q3, [x1, #32] +; CHECK-NEXT: stp q4, q0, [x1] ; CHECK-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-trunc.ll b/llvm/test/CodeGen/AArch64/sve-trunc.ll index 92dfc73961362..0ec6538947c73 100644 --- a/llvm/test/CodeGen/AArch64/sve-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-trunc.ll @@ -61,8 +61,8 @@ entry: define @trunc_i64toi1( %in) { ; CHECK-LABEL: trunc_i64toi1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ret entry: @@ -73,9 +73,9 @@ entry: define @trunc_i64toi1_split( %in) { ; CHECK-LABEL: trunc_i64toi1_split: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0x1 ; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s @@ -88,11 +88,11 @@ entry: define @trunc_i64toi1_split2( %in) { ; CHECK-LABEL: trunc_i64toi1_split2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z3.d, z3.d, #0x1 ; CHECK-NEXT: and z2.d, z2.d, #0x1 ; CHECK-NEXT: and z1.d, z1.d, #0x1 ; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpne p1.d, p0/z, z3.d, #0 ; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0 ; CHECK-NEXT: cmpne p3.d, p0/z, z1.d, #0 @@ -111,12 +111,12 @@ define @trunc_i64toi1_split3( %in) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z7.d, z7.d, #0x1 ; CHECK-NEXT: and z6.d, z6.d, #0x1 ; CHECK-NEXT: and z5.d, z5.d, #0x1 @@ -125,23 +125,25 @@ define @trunc_i64toi1_split3( %in) { ; CHECK-NEXT: and z2.d, z2.d, #0x1 ; CHECK-NEXT: and z1.d, z1.d, #0x1 ; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpne p1.d, p0/z, z7.d, #0 ; CHECK-NEXT: cmpne p2.d, p0/z, z6.d, #0 ; CHECK-NEXT: cmpne p3.d, p0/z, z5.d, #0 ; CHECK-NEXT: cmpne p4.d, p0/z, z4.d, #0 ; CHECK-NEXT: cmpne p5.d, p0/z, z3.d, #0 ; CHECK-NEXT: cmpne p6.d, p0/z, z2.d, #0 -; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s -; CHECK-NEXT: cmpne p2.d, p0/z, z1.d, #0 +; CHECK-NEXT: cmpne p7.d, p0/z, z1.d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: uzp1 p3.s, p4.s, p3.s -; CHECK-NEXT: uzp1 p4.s, p6.s, p5.s +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: uzp1 p2.s, p4.s, p3.s +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p3.s, p6.s, p5.s ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p0.s, p0.s, p2.s +; CHECK-NEXT: uzp1 p0.s, p0.s, p7.s +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p1.h, p3.h, p1.h -; CHECK-NEXT: uzp1 p0.h, p0.h, p4.h -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p0.h, p3.h ; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -155,8 +157,8 @@ entry: define @trunc_i32toi1( %in) { ; CHECK-LABEL: trunc_i32toi1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ret entry: @@ -167,8 +169,8 @@ entry: define @trunc_i16toi1( %in) { ; CHECK-LABEL: trunc_i16toi1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: ret entry: @@ -179,8 +181,8 @@ entry: define @trunc_i8toi1( %in) { ; CHECK-LABEL: trunc_i8toi1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: and z0.b, z0.b, #0x1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; CHECK-NEXT: ret entry: @@ -191,8 +193,8 @@ entry: define @trunc_nxv1i32_to_nxv1i1( %in) { ; CHECK-LABEL: trunc_nxv1i32_to_nxv1i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b @@ -204,8 +206,8 @@ define @trunc_nxv1i32_to_nxv1i1( %in) { define void @trunc_promoteIntRes( %0, ptr %ptr) { ; CHECK-LABEL: trunc_promoteIntRes: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll b/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll index 36d64725742e5..818f37c85ffdb 100644 --- a/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll +++ b/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll @@ -6,9 +6,9 @@ declare { , } @llvm.umul.with.overflow.nxv2i8 define @umulo_nxv2i8( %x, %y) { ; CHECK-LABEL: umulo_nxv2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0xff ; CHECK-NEXT: and z0.d, z0.d, #0xff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: lsr z1.d, z0.d, #8 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 @@ -26,9 +26,9 @@ declare { , } @llvm.umul.with.overflow.nxv4i8 define @umulo_nxv4i8( %x, %y) { ; CHECK-LABEL: umulo_nxv4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z1.s, z1.s, #0xff ; CHECK-NEXT: and z0.s, z0.s, #0xff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: lsr z1.s, z0.s, #8 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 @@ -46,9 +46,9 @@ declare { , } @llvm.umul.with.overflow.nxv8i8 define @umulo_nxv8i8( %x, %y) { ; CHECK-LABEL: umulo_nxv8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: lsr z1.h, z0.h, #8 ; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0 @@ -119,9 +119,9 @@ define @umulo_nxv64i8( %x, , } @llvm.umul.with.overflow.nxv2i define @umulo_nxv2i16( %x, %y) { ; CHECK-LABEL: umulo_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0xffff ; CHECK-NEXT: and z0.d, z0.d, #0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: lsr z1.d, z0.d, #16 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 @@ -163,9 +163,9 @@ declare { , } @llvm.umul.with.overflow.nxv4i define @umulo_nxv4i16( %x, %y) { ; CHECK-LABEL: umulo_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z1.s, z1.s, #0xffff ; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: lsr z1.s, z0.s, #16 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 @@ -236,9 +236,9 @@ define @umulo_nxv32i16( %x, , } @llvm.umul.with.overflow.nxv2i define @umulo_nxv2i32( %x, %y) { ; CHECK-LABEL: umulo_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: lsr z1.d, z0.d, #32 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 @@ -333,9 +333,9 @@ define @umulo_nxv16i32( %x, @umulo_nxv8i64( %x, %a, ptr %b) #0 { define void @uzp1_i8_invalid( %a, ptr %b) #0 { ; CHECK-LABEL: uzp1_i8_invalid: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %a.bc = bitcast %a to @@ -141,8 +141,8 @@ define void @uzp1_i16_valid( %a, ptr %b) #0 { define void @uzp1_i16_invalid( %a, ptr %b) #0 { ; CHECK-LABEL: uzp1_i16_invalid: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %a.bc = bitcast %a to @@ -168,8 +168,8 @@ define void @uzp1_i32_valid( %a, ptr %b) #0 { define void @uzp1_i32_invalid( %a, ptr %b) #0 { ; CHECK-LABEL: uzp1_i32_invalid: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %a.bc = bitcast %a to @@ -182,8 +182,8 @@ define void @uzp1_i32_invalid( %a, ptr %b) #0 { define void @uzp1_invalid_all( %a, ptr %b) #0 { ; CHECK-LABEL: uzp1_invalid_all: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %a.bc = bitcast %a to diff --git a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll index 194d1071301d4..91f8f5c2c90d8 100644 --- a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll +++ b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll @@ -8,11 +8,11 @@ define i32 @test( %bin.rdx, %bin.rdx2) { ; CHECK-NEXT: sunpklo z5.h, z0.b ; CHECK-NEXT: sunpkhi z0.h, z0.b ; CHECK-NEXT: sunpkhi z2.h, z2.b -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z6.h, z1.b ; CHECK-NEXT: sunpkhi z1.h, z1.b ; CHECK-NEXT: sunpklo z7.h, z3.b ; CHECK-NEXT: sunpkhi z3.h, z3.b +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z24.s, z5.h ; CHECK-NEXT: sunpklo z5.s, z5.h ; CHECK-NEXT: sunpklo z25.s, z4.h diff --git a/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll b/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll index 898090340869e..0bdaefdfc2a3f 100644 --- a/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll +++ b/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll @@ -80,8 +80,8 @@ define i1 @reduce_and_insert_subvec_into_var( %in, @test_copysign_v4f32_v4f64( %a, ; CHECK_NO_EXTEND_ROUND-LABEL: test_copysign_v4f32_v4f64: ; CHECK_NO_EXTEND_ROUND: // %bb.0: ; CHECK_NO_EXTEND_ROUND-NEXT: ptrue p0.d +; CHECK_NO_EXTEND_ROUND-NEXT: mov z3.s, #0x7fffffff ; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z2.s, p0/m, z2.d ; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z1.s, p0/m, z1.d ; CHECK_NO_EXTEND_ROUND-NEXT: uzp1 z1.s, z1.s, z2.s -; CHECK_NO_EXTEND_ROUND-NEXT: mov z2.s, #0x7fffffff -; CHECK_NO_EXTEND_ROUND-NEXT: bsl z0.d, z0.d, z1.d, z2.d +; CHECK_NO_EXTEND_ROUND-NEXT: bsl z0.d, z0.d, z1.d, z3.d ; CHECK_NO_EXTEND_ROUND-NEXT: ret ; ; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f32_v4f64: @@ -107,9 +107,9 @@ declare @llvm.copysign.v2f64( %a, @test_copysign_v4f64_v4f32( %a, %b) #0 { ; CHECK_NO_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32: ; CHECK_NO_EXTEND_ROUND: // %bb.0: -; CHECK_NO_EXTEND_ROUND-NEXT: ptrue p0.d ; CHECK_NO_EXTEND_ROUND-NEXT: uunpkhi z3.d, z2.s ; CHECK_NO_EXTEND_ROUND-NEXT: uunpklo z2.d, z2.s +; CHECK_NO_EXTEND_ROUND-NEXT: ptrue p0.d ; CHECK_NO_EXTEND_ROUND-NEXT: mov z4.d, #0x7fffffffffffffff ; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z3.d, p0/m, z3.s ; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z2.d, p0/m, z2.s @@ -119,9 +119,9 @@ define @test_copysign_v4f64_v4f32( %a ; ; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32: ; CHECK_EXTEND_ROUND: // %bb.0: -; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d ; CHECK_EXTEND_ROUND-NEXT: uunpkhi z3.d, z2.s ; CHECK_EXTEND_ROUND-NEXT: uunpklo z2.d, z2.s +; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d ; CHECK_EXTEND_ROUND-NEXT: mov z4.d, #0x7fffffffffffffff ; CHECK_EXTEND_ROUND-NEXT: fcvt z2.d, p0/m, z2.s ; CHECK_EXTEND_ROUND-NEXT: fcvt z3.d, p0/m, z3.s @@ -176,11 +176,11 @@ define @test_copysign_v4f16_v4f64( %a, @test_copysign_v8f16_v8f32( %a, %arg1){ ; CHECK-LABEL: wide_add_shift_add_rshrnb_b: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: rshrnb z1.b, z1.h, #6 ; CHECK-NEXT: rshrnb z0.b, z0.h, #6 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, x1] ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, x1] -; CHECK-NEXT: add z0.b, z1.b, z0.b +; CHECK-NEXT: add z0.b, z2.b, z0.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0, x1] ; CHECK-NEXT: ret %1 = add %arg1, splat (i16 32) @@ -141,12 +141,12 @@ define void @wide_add_shift_add_rshrnb_b(ptr %dest, i64 %index, %arg1){ ; CHECK-LABEL: wide_add_shift_add_rshrnb_h: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: rshrnb z1.h, z1.s, #6 ; CHECK-NEXT: rshrnb z0.h, z0.s, #6 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x1, lsl #1] ; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x1, lsl #1] -; CHECK-NEXT: add z0.h, z1.h, z0.h +; CHECK-NEXT: add z0.h, z2.h, z0.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1] ; CHECK-NEXT: ret %1 = add %arg1, splat (i32 32) @@ -162,12 +162,12 @@ define void @wide_add_shift_add_rshrnb_h(ptr %dest, i64 %index, %arg1){ ; CHECK-LABEL: wide_add_shift_add_rshrnb_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: rshrnb z1.s, z1.d, #32 ; CHECK-NEXT: rshrnb z0.s, z0.d, #32 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x1, lsl #2] ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x1, lsl #2] -; CHECK-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEXT: add z0.s, z2.s, z0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2] ; CHECK-NEXT: ret %1 = add %arg1, splat (i64 2147483648) @@ -188,11 +188,11 @@ define void @neg_wide_add_shift_add_rshrnb_d(ptr %dest, i64 %index, %arg1, splat (i64 140737488355328) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll index 8bfcd088e1a86..500973d053f5b 100644 --- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll @@ -437,8 +437,8 @@ define @uqsub_i32_ptrue_all_h( %a) #0 { define @uqsub_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: uqsub_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uqsub z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) diff --git a/llvm/test/CodeGen/AArch64/sve2-rsh.ll b/llvm/test/CodeGen/AArch64/sve2-rsh.ll index 516ef3bd581ee..9addd16f89292 100644 --- a/llvm/test/CodeGen/AArch64/sve2-rsh.ll +++ b/llvm/test/CodeGen/AArch64/sve2-rsh.ll @@ -18,8 +18,8 @@ define @neg_urshr_1( %x) { define @neg_urshr_2( %x, %y) { ; CHECK-LABEL: neg_urshr_2: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add z0.d, z0.d, #32 // =0x20 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %add = add nuw nsw %x, splat (i64 32) diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll index e297ade6b9ae1..e5a240b7a53fd 100644 --- a/llvm/test/CodeGen/AArch64/sve2-xar.ll +++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll @@ -152,9 +152,9 @@ define @xar_nxv2i64_l_neg1( %x, , } @sel_x2_i8(target("aarch64.svc ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.b, z1.b }, pn8, { z6.b, z7.b }, { z4.b, z5.b } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -28,12 +28,12 @@ define { , } @sel_x2_i16(target("aarch64.sv ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -48,12 +48,12 @@ define { , } @sel_x2_f16(target("aarch64. ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -68,12 +68,12 @@ define { , } @sel_x2_bf16(target("aar ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -88,12 +88,12 @@ define { , } @sel_x2_i32(target("aarch64.sv ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.s, z1.s }, pn8, { z6.s, z7.s }, { z4.s, z5.s } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -108,12 +108,12 @@ define { , } @sel_x2_f32(target("aarch6 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.s, z1.s }, pn8, { z6.s, z7.s }, { z4.s, z5.s } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -128,12 +128,12 @@ define { , } @sel_x2_i64(target("aarch64.sv ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.d, z1.d }, pn8, { z6.d, z7.d }, { z4.d, z5.d } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -148,12 +148,12 @@ define { , } @sel_x2_f64(target("aarc ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.d, z1.d }, pn8, { z6.d, z7.d }, { z4.d, z5.d } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll index df504362680ba..3a21eaead5f72 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll @@ -8,17 +8,17 @@ define { , , , , , , , , , , , , , , , , , , , , , , , , %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: st1h { z2.h, z3.h }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -117,9 +117,9 @@ define void @st1_x2_f32( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: st1w { z2.s, z3.s }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -135,9 +135,9 @@ define void @st1_x2_f64( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: st1d { z2.d, z3.d }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -153,8 +153,8 @@ define void @st1_x4_i8( %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -273,8 +273,8 @@ define void @st1_x4_f32( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -293,8 +293,8 @@ define void @st1_x4_f64( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -315,9 +315,9 @@ define void @stnt1_x2_i8( %unused, %zn0, %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -351,9 +351,9 @@ define void @stnt1_x2_i32( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1w { z2.s, z3.s }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -369,9 +369,9 @@ define void @stnt1_x2_i64( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1d { z2.d, z3.d }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -387,9 +387,9 @@ define void @stnt1_x2_f16( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -405,9 +405,9 @@ define void @stnt1_x2_bf16( %unused, %zn ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -423,9 +423,9 @@ define void @stnt1_x2_f32( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1w { z2.s, z3.s }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -441,9 +441,9 @@ define void @stnt1_x2_f64( %unused, %zn0 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1d { z2.d, z3.d }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -459,8 +459,8 @@ define void @stnt1_x4_i8( %unused, %zn0, %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -499,8 +499,8 @@ define void @stnt1_x4_i32( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -519,8 +519,8 @@ define void @stnt1_x4_i64( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -539,8 +539,8 @@ define void @stnt1_x4_f16( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -559,8 +559,8 @@ define void @stnt1_x4_bf16( %unused, %zn ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -579,8 +579,8 @@ define void @stnt1_x4_f32( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -599,8 +599,8 @@ define void @stnt1_x4_f64( %unused, %zn0 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index 30ff70088454d..16521834090b5 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -147,11 +147,11 @@ define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v4i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldr s1, [x0] -; CHECK-SD-NEXT: ldr s2, [x1] -; CHECK-SD-NEXT: movi d0, #0xff00ff00ff00ff -; CHECK-SD-NEXT: uaddl v1.8h, v1.8b, v2.8b -; CHECK-SD-NEXT: umin v0.4h, v1.4h, v0.4h +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: movi d2, #0xff00ff00ff00ff +; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: umin v0.4h, v0.4h, v2.4h ; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: str s0, [x2] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll index b31ce94cdaaea..d5f1febaeb7db 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll @@ -52,11 +52,11 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind { ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: shl v1.4s, v0.4s, #31 -; CHECK-NEXT: usra v1.4s, v0.4s, #1 -; CHECK-NEXT: movi v0.16b, #170 -; CHECK-NEXT: fneg v0.4s, v0.4s -; CHECK-NEXT: cmhs v0.4s, v0.4s, v1.4s +; CHECK-NEXT: movi v1.16b, #170 +; CHECK-NEXT: shl v2.4s, v0.4s, #31 +; CHECK-NEXT: fneg v1.4s, v1.4s +; CHECK-NEXT: usra v2.4s, v0.4s, #1 +; CHECK-NEXT: cmhs v0.4s, v1.4s, v2.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll index 00609b0df9b4e..37c6374215d81 100644 --- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll +++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll @@ -100,9 +100,9 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; CHECK-NEXT: mov v0.s[3], w3 ; CHECK-NEXT: cmhi v3.4s, v3.4s, v2.4s ; CHECK-NEXT: str d2, [x8, #16] -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s ; CHECK-NEXT: mov w5, v3.s[1] ; CHECK-NEXT: fmov w4, s3 +; CHECK-NEXT: add v1.4s, v0.4s, v1.4s ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: str q1, [x8] ; CHECK-NEXT: mov w1, v0.s[1] @@ -248,10 +248,10 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind { ; CHECK-NEXT: eor v2.8b, v0.8b, v1.8b ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: adrp x8, .LCPI10_0 -; CHECK-NEXT: shl v2.4h, v2.4h, #15 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: cmlt v1.4h, v2.4h, #0 +; CHECK-NEXT: shl v1.4h, v2.4h, #15 ; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: cmlt v1.4h, v1.4h, #0 ; CHECK-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b ; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 66ef436f48c63..3254c5ebe9c6b 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -2177,65 +2177,65 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) { ; CHECK-GI-BASE-LABEL: test_udot_v48i8: ; CHECK-GI-BASE: // %bb.0: // %entry ; CHECK-GI-BASE-NEXT: ldp q0, q3, [x1] -; CHECK-GI-BASE-NEXT: ldr q6, [x1, #32] +; CHECK-GI-BASE-NEXT: ldr q6, [x0, #32] ; CHECK-GI-BASE-NEXT: ldp q1, q2, [x0] -; CHECK-GI-BASE-NEXT: ldr q17, [x0, #32] +; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32] +; CHECK-GI-BASE-NEXT: ushll v20.8h, v6.8b, #0 +; CHECK-GI-BASE-NEXT: ushll2 v6.8h, v6.16b, #0 ; CHECK-GI-BASE-NEXT: ushll v4.8h, v0.8b, #0 ; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-GI-BASE-NEXT: ushll v7.8h, v3.8b, #0 +; CHECK-GI-BASE-NEXT: ushll v16.8h, v3.8b, #0 ; CHECK-GI-BASE-NEXT: ushll v5.8h, v1.8b, #0 ; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0 -; CHECK-GI-BASE-NEXT: ushll v16.8h, v2.8b, #0 +; CHECK-GI-BASE-NEXT: ushll v17.8h, v2.8b, #0 ; CHECK-GI-BASE-NEXT: ushll2 v3.8h, v3.16b, #0 ; CHECK-GI-BASE-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-GI-BASE-NEXT: umull v18.4s, v4.4h, v5.4h ; CHECK-GI-BASE-NEXT: umull2 v4.4s, v4.8h, v5.8h -; CHECK-GI-BASE-NEXT: umull2 v19.4s, v0.8h, v1.8h -; CHECK-GI-BASE-NEXT: umull v20.4s, v7.4h, v16.4h -; CHECK-GI-BASE-NEXT: umull v0.4s, v0.4h, v1.4h -; CHECK-GI-BASE-NEXT: ushll v5.8h, v6.8b, #0 -; CHECK-GI-BASE-NEXT: ushll v1.8h, v17.8b, #0 -; CHECK-GI-BASE-NEXT: umull2 v7.4s, v7.8h, v16.8h -; CHECK-GI-BASE-NEXT: ushll2 v6.8h, v6.16b, #0 -; CHECK-GI-BASE-NEXT: ushll2 v17.8h, v17.16b, #0 -; CHECK-GI-BASE-NEXT: addv s16, v18.4s -; CHECK-GI-BASE-NEXT: addv s4, v4.4s -; CHECK-GI-BASE-NEXT: umull v18.4s, v3.4h, v2.4h +; CHECK-GI-BASE-NEXT: umull v5.4s, v0.4h, v1.4h +; CHECK-GI-BASE-NEXT: umull2 v0.4s, v0.8h, v1.8h +; CHECK-GI-BASE-NEXT: umull v19.4s, v16.4h, v17.4h +; CHECK-GI-BASE-NEXT: ushll v1.8h, v7.8b, #0 +; CHECK-GI-BASE-NEXT: umull2 v16.4s, v16.8h, v17.8h +; CHECK-GI-BASE-NEXT: umull v17.4s, v3.4h, v2.4h ; CHECK-GI-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h -; CHECK-GI-BASE-NEXT: addv s3, v19.4s -; CHECK-GI-BASE-NEXT: umull v19.4s, v5.4h, v1.4h -; CHECK-GI-BASE-NEXT: umull2 v1.4s, v5.8h, v1.8h -; CHECK-GI-BASE-NEXT: addv s5, v20.4s +; CHECK-GI-BASE-NEXT: ushll2 v7.8h, v7.16b, #0 +; CHECK-GI-BASE-NEXT: addv s18, v18.4s +; CHECK-GI-BASE-NEXT: addv s4, v4.4s +; CHECK-GI-BASE-NEXT: addv s5, v5.4s ; CHECK-GI-BASE-NEXT: addv s0, v0.4s -; CHECK-GI-BASE-NEXT: addv s7, v7.4s -; CHECK-GI-BASE-NEXT: umull v20.4s, v6.4h, v17.4h -; CHECK-GI-BASE-NEXT: umull2 v6.4s, v6.8h, v17.8h -; CHECK-GI-BASE-NEXT: fmov w8, s16 -; CHECK-GI-BASE-NEXT: fmov w9, s4 -; CHECK-GI-BASE-NEXT: fmov w10, s3 -; CHECK-GI-BASE-NEXT: addv s3, v18.4s +; CHECK-GI-BASE-NEXT: addv s19, v19.4s +; CHECK-GI-BASE-NEXT: umull v3.4s, v1.4h, v20.4h ; CHECK-GI-BASE-NEXT: addv s2, v2.4s -; CHECK-GI-BASE-NEXT: fmov w11, s5 -; CHECK-GI-BASE-NEXT: addv s4, v19.4s +; CHECK-GI-BASE-NEXT: umull2 v1.4s, v1.8h, v20.8h +; CHECK-GI-BASE-NEXT: umull v20.4s, v7.4h, v6.4h +; CHECK-GI-BASE-NEXT: fmov w8, s18 +; CHECK-GI-BASE-NEXT: fmov w9, s4 +; CHECK-GI-BASE-NEXT: fmov w10, s5 +; CHECK-GI-BASE-NEXT: fmov w11, s0 +; CHECK-GI-BASE-NEXT: fmov w12, s19 +; CHECK-GI-BASE-NEXT: addv s4, v16.4s +; CHECK-GI-BASE-NEXT: addv s5, v17.4s +; CHECK-GI-BASE-NEXT: addv s3, v3.4s +; CHECK-GI-BASE-NEXT: umull2 v0.4s, v7.8h, v6.8h ; CHECK-GI-BASE-NEXT: add w8, w8, w9 -; CHECK-GI-BASE-NEXT: fmov w9, s0 -; CHECK-GI-BASE-NEXT: addv s0, v1.4s -; CHECK-GI-BASE-NEXT: addv s1, v20.4s -; CHECK-GI-BASE-NEXT: addv s5, v6.4s -; CHECK-GI-BASE-NEXT: add w10, w10, w11 -; CHECK-GI-BASE-NEXT: fmov w11, s3 +; CHECK-GI-BASE-NEXT: addv s1, v1.4s +; CHECK-GI-BASE-NEXT: add w9, w11, w12 +; CHECK-GI-BASE-NEXT: add w8, w8, w10 +; CHECK-GI-BASE-NEXT: fmov w10, s4 +; CHECK-GI-BASE-NEXT: fmov w11, s5 ; CHECK-GI-BASE-NEXT: fmov w12, s2 -; CHECK-GI-BASE-NEXT: add w8, w8, w9 -; CHECK-GI-BASE-NEXT: fmov w9, s7 -; CHECK-GI-BASE-NEXT: add w9, w10, w9 +; CHECK-GI-BASE-NEXT: addv s4, v20.4s +; CHECK-GI-BASE-NEXT: addv s0, v0.4s +; CHECK-GI-BASE-NEXT: add w9, w9, w10 ; CHECK-GI-BASE-NEXT: add w10, w11, w12 -; CHECK-GI-BASE-NEXT: fmov w11, s4 +; CHECK-GI-BASE-NEXT: fmov w11, s3 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: add w9, w10, w11 -; CHECK-GI-BASE-NEXT: fmov w10, s0 -; CHECK-GI-BASE-NEXT: fmov w11, s5 -; CHECK-GI-BASE-NEXT: add w9, w9, w10 ; CHECK-GI-BASE-NEXT: fmov w10, s1 +; CHECK-GI-BASE-NEXT: fmov w11, s0 +; CHECK-GI-BASE-NEXT: add w9, w9, w10 +; CHECK-GI-BASE-NEXT: fmov w10, s4 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: add w9, w10, w11 ; CHECK-GI-BASE-NEXT: add w0, w8, w9 @@ -2527,65 +2527,65 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) { ; CHECK-GI-BASE-LABEL: test_sdot_v48i8: ; CHECK-GI-BASE: // %bb.0: // %entry ; CHECK-GI-BASE-NEXT: ldp q0, q3, [x1] -; CHECK-GI-BASE-NEXT: ldr q6, [x1, #32] +; CHECK-GI-BASE-NEXT: ldr q6, [x0, #32] ; CHECK-GI-BASE-NEXT: ldp q1, q2, [x0] -; CHECK-GI-BASE-NEXT: ldr q17, [x0, #32] +; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32] +; CHECK-GI-BASE-NEXT: sshll v20.8h, v6.8b, #0 +; CHECK-GI-BASE-NEXT: sshll2 v6.8h, v6.16b, #0 ; CHECK-GI-BASE-NEXT: sshll v4.8h, v0.8b, #0 ; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0 -; CHECK-GI-BASE-NEXT: sshll v7.8h, v3.8b, #0 +; CHECK-GI-BASE-NEXT: sshll v16.8h, v3.8b, #0 ; CHECK-GI-BASE-NEXT: sshll v5.8h, v1.8b, #0 ; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0 -; CHECK-GI-BASE-NEXT: sshll v16.8h, v2.8b, #0 +; CHECK-GI-BASE-NEXT: sshll v17.8h, v2.8b, #0 ; CHECK-GI-BASE-NEXT: sshll2 v3.8h, v3.16b, #0 ; CHECK-GI-BASE-NEXT: sshll2 v2.8h, v2.16b, #0 ; CHECK-GI-BASE-NEXT: smull v18.4s, v4.4h, v5.4h ; CHECK-GI-BASE-NEXT: smull2 v4.4s, v4.8h, v5.8h -; CHECK-GI-BASE-NEXT: smull2 v19.4s, v0.8h, v1.8h -; CHECK-GI-BASE-NEXT: smull v20.4s, v7.4h, v16.4h -; CHECK-GI-BASE-NEXT: smull v0.4s, v0.4h, v1.4h -; CHECK-GI-BASE-NEXT: sshll v5.8h, v6.8b, #0 -; CHECK-GI-BASE-NEXT: sshll v1.8h, v17.8b, #0 -; CHECK-GI-BASE-NEXT: smull2 v7.4s, v7.8h, v16.8h -; CHECK-GI-BASE-NEXT: sshll2 v6.8h, v6.16b, #0 -; CHECK-GI-BASE-NEXT: sshll2 v17.8h, v17.16b, #0 -; CHECK-GI-BASE-NEXT: addv s16, v18.4s -; CHECK-GI-BASE-NEXT: addv s4, v4.4s -; CHECK-GI-BASE-NEXT: smull v18.4s, v3.4h, v2.4h +; CHECK-GI-BASE-NEXT: smull v5.4s, v0.4h, v1.4h +; CHECK-GI-BASE-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-GI-BASE-NEXT: smull v19.4s, v16.4h, v17.4h +; CHECK-GI-BASE-NEXT: sshll v1.8h, v7.8b, #0 +; CHECK-GI-BASE-NEXT: smull2 v16.4s, v16.8h, v17.8h +; CHECK-GI-BASE-NEXT: smull v17.4s, v3.4h, v2.4h ; CHECK-GI-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h -; CHECK-GI-BASE-NEXT: addv s3, v19.4s -; CHECK-GI-BASE-NEXT: smull v19.4s, v5.4h, v1.4h -; CHECK-GI-BASE-NEXT: smull2 v1.4s, v5.8h, v1.8h -; CHECK-GI-BASE-NEXT: addv s5, v20.4s +; CHECK-GI-BASE-NEXT: sshll2 v7.8h, v7.16b, #0 +; CHECK-GI-BASE-NEXT: addv s18, v18.4s +; CHECK-GI-BASE-NEXT: addv s4, v4.4s +; CHECK-GI-BASE-NEXT: addv s5, v5.4s ; CHECK-GI-BASE-NEXT: addv s0, v0.4s -; CHECK-GI-BASE-NEXT: addv s7, v7.4s -; CHECK-GI-BASE-NEXT: smull v20.4s, v6.4h, v17.4h -; CHECK-GI-BASE-NEXT: smull2 v6.4s, v6.8h, v17.8h -; CHECK-GI-BASE-NEXT: fmov w8, s16 -; CHECK-GI-BASE-NEXT: fmov w9, s4 -; CHECK-GI-BASE-NEXT: fmov w10, s3 -; CHECK-GI-BASE-NEXT: addv s3, v18.4s +; CHECK-GI-BASE-NEXT: addv s19, v19.4s +; CHECK-GI-BASE-NEXT: smull v3.4s, v1.4h, v20.4h ; CHECK-GI-BASE-NEXT: addv s2, v2.4s -; CHECK-GI-BASE-NEXT: fmov w11, s5 -; CHECK-GI-BASE-NEXT: addv s4, v19.4s +; CHECK-GI-BASE-NEXT: smull2 v1.4s, v1.8h, v20.8h +; CHECK-GI-BASE-NEXT: smull v20.4s, v7.4h, v6.4h +; CHECK-GI-BASE-NEXT: fmov w8, s18 +; CHECK-GI-BASE-NEXT: fmov w9, s4 +; CHECK-GI-BASE-NEXT: fmov w10, s5 +; CHECK-GI-BASE-NEXT: fmov w11, s0 +; CHECK-GI-BASE-NEXT: fmov w12, s19 +; CHECK-GI-BASE-NEXT: addv s4, v16.4s +; CHECK-GI-BASE-NEXT: addv s5, v17.4s +; CHECK-GI-BASE-NEXT: addv s3, v3.4s +; CHECK-GI-BASE-NEXT: smull2 v0.4s, v7.8h, v6.8h ; CHECK-GI-BASE-NEXT: add w8, w8, w9 -; CHECK-GI-BASE-NEXT: fmov w9, s0 -; CHECK-GI-BASE-NEXT: addv s0, v1.4s -; CHECK-GI-BASE-NEXT: addv s1, v20.4s -; CHECK-GI-BASE-NEXT: addv s5, v6.4s -; CHECK-GI-BASE-NEXT: add w10, w10, w11 -; CHECK-GI-BASE-NEXT: fmov w11, s3 +; CHECK-GI-BASE-NEXT: addv s1, v1.4s +; CHECK-GI-BASE-NEXT: add w9, w11, w12 +; CHECK-GI-BASE-NEXT: add w8, w8, w10 +; CHECK-GI-BASE-NEXT: fmov w10, s4 +; CHECK-GI-BASE-NEXT: fmov w11, s5 ; CHECK-GI-BASE-NEXT: fmov w12, s2 -; CHECK-GI-BASE-NEXT: add w8, w8, w9 -; CHECK-GI-BASE-NEXT: fmov w9, s7 -; CHECK-GI-BASE-NEXT: add w9, w10, w9 +; CHECK-GI-BASE-NEXT: addv s4, v20.4s +; CHECK-GI-BASE-NEXT: addv s0, v0.4s +; CHECK-GI-BASE-NEXT: add w9, w9, w10 ; CHECK-GI-BASE-NEXT: add w10, w11, w12 -; CHECK-GI-BASE-NEXT: fmov w11, s4 +; CHECK-GI-BASE-NEXT: fmov w11, s3 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: add w9, w10, w11 -; CHECK-GI-BASE-NEXT: fmov w10, s0 -; CHECK-GI-BASE-NEXT: fmov w11, s5 -; CHECK-GI-BASE-NEXT: add w9, w9, w10 ; CHECK-GI-BASE-NEXT: fmov w10, s1 +; CHECK-GI-BASE-NEXT: fmov w11, s0 +; CHECK-GI-BASE-NEXT: add w9, w9, w10 +; CHECK-GI-BASE-NEXT: fmov w10, s4 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: add w9, w10, w11 ; CHECK-GI-BASE-NEXT: add w0, w8, w9 @@ -2640,13 +2640,13 @@ define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) { ; CHECK-SD-DOT-LABEL: test_udot_v8i8_multi_use: ; CHECK-SD-DOT: // %bb.0: // %entry ; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-DOT-NEXT: ushll v3.8h, v0.8b, #0 +; CHECK-SD-DOT-NEXT: ushll v4.8h, v1.8b, #0 ; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v0.8b -; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-DOT-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-SD-DOT-NEXT: umull v0.4s, v1.4h, v0.4h -; CHECK-SD-DOT-NEXT: addp v2.2s, v2.2s, v2.2s +; CHECK-SD-DOT-NEXT: umull v0.4s, v4.4h, v3.4h +; CHECK-SD-DOT-NEXT: addp v1.2s, v2.2s, v2.2s ; CHECK-SD-DOT-NEXT: fmov w9, s0 -; CHECK-SD-DOT-NEXT: fmov w8, s2 +; CHECK-SD-DOT-NEXT: fmov w8, s1 ; CHECK-SD-DOT-NEXT: add w0, w8, w9 ; CHECK-SD-DOT-NEXT: ret ; @@ -3534,21 +3534,21 @@ entry: define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) { ; CHECK-SD-LABEL: add_pair_v4i8_v4i64_sext: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-SD-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-SD-NEXT: ushll v3.2d, v1.2s, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v2.2d, v1.2s, #0 +; CHECK-SD-NEXT: ushll v3.2d, v0.2s, #0 ; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0 ; CHECK-SD-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-SD-NEXT: shl v2.2d, v2.2d, #56 ; CHECK-SD-NEXT: shl v3.2d, v3.2d, #56 +; CHECK-SD-NEXT: shl v2.2d, v2.2d, #56 ; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56 ; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56 -; CHECK-SD-NEXT: sshr v2.2d, v2.2d, #56 ; CHECK-SD-NEXT: sshr v3.2d, v3.2d, #56 -; CHECK-SD-NEXT: ssra v2.2d, v0.2d, #56 -; CHECK-SD-NEXT: ssra v3.2d, v1.2d, #56 -; CHECK-SD-NEXT: add v0.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: sshr v2.2d, v2.2d, #56 +; CHECK-SD-NEXT: ssra v3.2d, v0.2d, #56 +; CHECK-SD-NEXT: ssra v2.2d, v1.2d, #56 +; CHECK-SD-NEXT: add v0.2d, v3.2d, v2.2d ; CHECK-SD-NEXT: addp d0, v0.2d ; CHECK-SD-NEXT: fmov x0, d0 ; CHECK-SD-NEXT: ret @@ -3816,37 +3816,37 @@ define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) { ; CHECK-SD-NEXT: ldr b1, [sp, #64] ; CHECK-SD-NEXT: add x8, sp, #72 ; CHECK-SD-NEXT: ldr b2, [sp] -; CHECK-SD-NEXT: add x9, sp, #8 +; CHECK-SD-NEXT: add x9, sp, #80 ; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #80 +; CHECK-SD-NEXT: add x8, sp, #8 ; CHECK-SD-NEXT: mov v0.b[1], w1 -; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-SD-NEXT: add x9, sp, #16 -; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #88 -; CHECK-SD-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-SD-NEXT: add x9, sp, #24 +; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #16 +; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-SD-NEXT: add x9, sp, #88 +; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #24 ; CHECK-SD-NEXT: mov v0.b[2], w2 -; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #96 -; CHECK-SD-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-SD-NEXT: add x9, sp, #32 +; CHECK-SD-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-SD-NEXT: add x9, sp, #96 +; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #32 ; CHECK-SD-NEXT: mov v0.b[3], w3 -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #104 -; CHECK-SD-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-SD-NEXT: add x9, sp, #40 -; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #112 +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: add x9, sp, #104 +; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #40 +; CHECK-SD-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-SD-NEXT: add x9, sp, #112 ; CHECK-SD-NEXT: mov v0.b[4], w4 -; CHECK-SD-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-SD-NEXT: add x9, sp, #48 -; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #120 -; CHECK-SD-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-SD-NEXT: mov v0.b[5], w5 -; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #48 +; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-SD-NEXT: add x9, sp, #120 +; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #56 +; CHECK-SD-NEXT: mov v0.b[5], w5 +; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] ; CHECK-SD-NEXT: mov v0.b[6], w6 ; CHECK-SD-NEXT: mov v0.b[7], w7 @@ -3942,37 +3942,37 @@ define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) { ; CHECK-SD-NEXT: ldr b1, [sp, #64] ; CHECK-SD-NEXT: add x8, sp, #72 ; CHECK-SD-NEXT: ldr b2, [sp] -; CHECK-SD-NEXT: add x9, sp, #8 +; CHECK-SD-NEXT: add x9, sp, #80 ; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #80 +; CHECK-SD-NEXT: add x8, sp, #8 ; CHECK-SD-NEXT: mov v0.b[1], w1 -; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-SD-NEXT: add x9, sp, #16 -; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #88 -; CHECK-SD-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-SD-NEXT: add x9, sp, #24 +; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #16 +; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-SD-NEXT: add x9, sp, #88 +; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #24 ; CHECK-SD-NEXT: mov v0.b[2], w2 -; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #96 -; CHECK-SD-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-SD-NEXT: add x9, sp, #32 +; CHECK-SD-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-SD-NEXT: add x9, sp, #96 +; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #32 ; CHECK-SD-NEXT: mov v0.b[3], w3 -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #104 -; CHECK-SD-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-SD-NEXT: add x9, sp, #40 -; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #112 +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: add x9, sp, #104 +; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #40 +; CHECK-SD-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-SD-NEXT: add x9, sp, #112 ; CHECK-SD-NEXT: mov v0.b[4], w4 -; CHECK-SD-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-SD-NEXT: add x9, sp, #48 -; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #120 -; CHECK-SD-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-SD-NEXT: mov v0.b[5], w5 -; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #48 +; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-SD-NEXT: add x9, sp, #120 +; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #56 +; CHECK-SD-NEXT: mov v0.b[5], w5 +; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] ; CHECK-SD-NEXT: mov v0.b[6], w6 ; CHECK-SD-NEXT: mov v0.b[7], w7 @@ -4069,48 +4069,48 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) { ; CHECK-SD-BASE-NEXT: ldr b1, [sp, #64] ; CHECK-SD-BASE-NEXT: add x8, sp, #72 ; CHECK-SD-BASE-NEXT: ldr b2, [sp] -; CHECK-SD-BASE-NEXT: add x9, sp, #8 +; CHECK-SD-BASE-NEXT: add x9, sp, #80 ; CHECK-SD-BASE-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #80 +; CHECK-SD-BASE-NEXT: add x8, sp, #8 ; CHECK-SD-BASE-NEXT: mov v0.b[1], w1 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #16 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #88 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #24 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #16 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #88 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #24 ; CHECK-SD-BASE-NEXT: mov v0.b[2], w2 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #96 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #32 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #96 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #32 ; CHECK-SD-BASE-NEXT: mov v0.b[3], w3 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #104 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #40 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #112 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #104 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #40 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #112 ; CHECK-SD-BASE-NEXT: mov v0.b[4], w4 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #48 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #120 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-SD-BASE-NEXT: mov v0.b[5], w5 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #48 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #120 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x8] ; CHECK-SD-BASE-NEXT: add x8, sp, #56 +; CHECK-SD-BASE-NEXT: mov v0.b[5], w5 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-SD-BASE-NEXT: ld1 { v2.b }[7], [x8] ; CHECK-SD-BASE-NEXT: mov v0.b[6], w6 ; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-BASE-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-SD-BASE-NEXT: mov v0.b[7], w7 ; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-SD-BASE-NEXT: uaddl2 v3.4s, v0.8h, v1.8h ; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h -; CHECK-SD-BASE-NEXT: ushll v1.8h, v2.8b, #0 -; CHECK-SD-BASE-NEXT: uaddw2 v2.4s, v3.4s, v1.8h -; CHECK-SD-BASE-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-SD-BASE-NEXT: uaddw2 v1.4s, v3.4s, v2.8h +; CHECK-SD-BASE-NEXT: uaddw v0.4s, v0.4s, v2.4h +; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-SD-BASE-NEXT: addv s0, v0.4s ; CHECK-SD-BASE-NEXT: fmov w0, s0 ; CHECK-SD-BASE-NEXT: ret @@ -4147,9 +4147,9 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) { ; CHECK-SD-DOT-NEXT: udot v4.2s, v1.8b, v5.8b ; CHECK-SD-DOT-NEXT: mov v0.b[7], w7 ; CHECK-SD-DOT-NEXT: addp v1.2s, v4.2s, v4.2s +; CHECK-SD-DOT-NEXT: fmov w9, s1 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[8], [x8] ; CHECK-SD-DOT-NEXT: add x8, sp, #8 -; CHECK-SD-DOT-NEXT: fmov w9, s1 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[9], [x8] ; CHECK-SD-DOT-NEXT: add x8, sp, #16 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[10], [x8] @@ -4342,48 +4342,48 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) { ; CHECK-SD-BASE-NEXT: ldr b1, [sp, #64] ; CHECK-SD-BASE-NEXT: add x8, sp, #72 ; CHECK-SD-BASE-NEXT: ldr b2, [sp] -; CHECK-SD-BASE-NEXT: add x9, sp, #8 +; CHECK-SD-BASE-NEXT: add x9, sp, #80 ; CHECK-SD-BASE-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #80 +; CHECK-SD-BASE-NEXT: add x8, sp, #8 ; CHECK-SD-BASE-NEXT: mov v0.b[1], w1 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #16 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #88 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #24 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #16 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #88 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #24 ; CHECK-SD-BASE-NEXT: mov v0.b[2], w2 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #96 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #32 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #96 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #32 ; CHECK-SD-BASE-NEXT: mov v0.b[3], w3 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #104 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #40 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #112 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #104 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #40 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #112 ; CHECK-SD-BASE-NEXT: mov v0.b[4], w4 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #48 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #120 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-SD-BASE-NEXT: mov v0.b[5], w5 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #48 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #120 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x8] ; CHECK-SD-BASE-NEXT: add x8, sp, #56 +; CHECK-SD-BASE-NEXT: mov v0.b[5], w5 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-SD-BASE-NEXT: ld1 { v2.b }[7], [x8] ; CHECK-SD-BASE-NEXT: mov v0.b[6], w6 ; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-SD-BASE-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-SD-BASE-NEXT: mov v0.b[7], w7 ; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-SD-BASE-NEXT: saddl2 v3.4s, v0.8h, v1.8h ; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h -; CHECK-SD-BASE-NEXT: sshll v1.8h, v2.8b, #0 -; CHECK-SD-BASE-NEXT: saddw2 v2.4s, v3.4s, v1.8h -; CHECK-SD-BASE-NEXT: saddw v0.4s, v0.4s, v1.4h -; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-SD-BASE-NEXT: saddw2 v1.4s, v3.4s, v2.8h +; CHECK-SD-BASE-NEXT: saddw v0.4s, v0.4s, v2.4h +; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-SD-BASE-NEXT: addv s0, v0.4s ; CHECK-SD-BASE-NEXT: fmov w0, s0 ; CHECK-SD-BASE-NEXT: ret @@ -4420,9 +4420,9 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) { ; CHECK-SD-DOT-NEXT: sdot v4.2s, v1.8b, v5.8b ; CHECK-SD-DOT-NEXT: mov v0.b[7], w7 ; CHECK-SD-DOT-NEXT: addp v1.2s, v4.2s, v4.2s +; CHECK-SD-DOT-NEXT: fmov w9, s1 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[8], [x8] ; CHECK-SD-DOT-NEXT: add x8, sp, #8 -; CHECK-SD-DOT-NEXT: fmov w9, s1 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[9], [x8] ; CHECK-SD-DOT-NEXT: add x8, sp, #16 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[10], [x8] @@ -4611,23 +4611,23 @@ entry: define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { ; CHECK-SD-BASE-LABEL: full: ; CHECK-SD-BASE: // %bb.0: // %entry -; CHECK-SD-BASE-NEXT: ldr d0, [x2] -; CHECK-SD-BASE-NEXT: ldr d1, [x0] ; CHECK-SD-BASE-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-SD-BASE-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-SD-BASE-NEXT: sxtw x8, w3 ; CHECK-SD-BASE-NEXT: sxtw x9, w1 -; CHECK-SD-BASE-NEXT: uabdl v0.8h, v1.8b, v0.8b -; CHECK-SD-BASE-NEXT: add x11, x2, x8 +; CHECK-SD-BASE-NEXT: ldr d0, [x0] +; CHECK-SD-BASE-NEXT: ldr d1, [x2] ; CHECK-SD-BASE-NEXT: add x10, x0, x9 -; CHECK-SD-BASE-NEXT: ldr d2, [x11] -; CHECK-SD-BASE-NEXT: add x11, x11, x8 +; CHECK-SD-BASE-NEXT: add x11, x2, x8 +; CHECK-SD-BASE-NEXT: uabdl v0.8h, v0.8b, v1.8b ; CHECK-SD-BASE-NEXT: ldr d1, [x10] +; CHECK-SD-BASE-NEXT: ldr d2, [x11] ; CHECK-SD-BASE-NEXT: add x10, x10, x9 -; CHECK-SD-BASE-NEXT: uaddlp v0.4s, v0.8h +; CHECK-SD-BASE-NEXT: add x11, x11, x8 ; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b ; CHECK-SD-BASE-NEXT: ldr d2, [x11] ; CHECK-SD-BASE-NEXT: add x11, x11, x8 +; CHECK-SD-BASE-NEXT: uaddlp v0.4s, v0.8h ; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h ; CHECK-SD-BASE-NEXT: ldr d1, [x10] ; CHECK-SD-BASE-NEXT: add x10, x10, x9 @@ -4723,98 +4723,98 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-GI-NEXT: // kill: def $w3 killed $w3 def $x3 -; CHECK-GI-NEXT: sxtw x8, w1 -; CHECK-GI-NEXT: sxtw x9, w3 +; CHECK-GI-NEXT: sxtw x8, w3 +; CHECK-GI-NEXT: sxtw x9, w1 ; CHECK-GI-NEXT: ldr d0, [x0] ; CHECK-GI-NEXT: ldr d1, [x2] -; CHECK-GI-NEXT: add x10, x0, x8 -; CHECK-GI-NEXT: add x11, x2, x9 +; CHECK-GI-NEXT: add x10, x0, x9 +; CHECK-GI-NEXT: add x11, x2, x8 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: ldr d2, [x10] -; CHECK-GI-NEXT: ldr d3, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x12, x11, x8 ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: add x11, x11, x9 +; CHECK-GI-NEXT: ldr d3, [x11] +; CHECK-GI-NEXT: ldr d4, [x10] +; CHECK-GI-NEXT: ldr d5, [x12] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x12, x8 ; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-GI-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-GI-NEXT: ldr d4, [x10] -; CHECK-GI-NEXT: ldr d5, [x11] -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: add x11, x11, x9 +; CHECK-GI-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0 ; CHECK-GI-NEXT: uabdl v6.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: uabdl2 v0.4s, v0.8h, v1.8h ; CHECK-GI-NEXT: ldr d1, [x10] -; CHECK-GI-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0 ; CHECK-GI-NEXT: ldr d7, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 ; CHECK-GI-NEXT: uabdl v16.4s, v2.4h, v3.4h ; CHECK-GI-NEXT: uabdl2 v2.4s, v2.8h, v3.8h -; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-GI-NEXT: ushll v7.8h, v7.8b, #0 -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: add x11, x11, x9 -; CHECK-GI-NEXT: uabdl v1.4s, v4.4h, v5.4h +; CHECK-GI-NEXT: uabdl v3.4s, v4.4h, v5.4h ; CHECK-GI-NEXT: uabdl2 v4.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v7.8h, v7.8b, #0 ; CHECK-GI-NEXT: ldr d5, [x10] -; CHECK-GI-NEXT: add v2.4s, v16.4s, v2.4s -; CHECK-GI-NEXT: ldr d16, [x11] +; CHECK-GI-NEXT: ldr d17, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 ; CHECK-GI-NEXT: add v0.4s, v6.4s, v0.4s -; CHECK-GI-NEXT: uabdl v6.4s, v3.4h, v7.4h -; CHECK-GI-NEXT: uabdl2 v3.4s, v3.8h, v7.8h ; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0 -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: ushll v7.8h, v16.8b, #0 -; CHECK-GI-NEXT: add x11, x11, x9 -; CHECK-GI-NEXT: ldr d16, [x10] -; CHECK-GI-NEXT: ldr d17, [x11] -; CHECK-GI-NEXT: add v1.4s, v1.4s, v4.4s -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: add x11, x11, x9 -; CHECK-GI-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-GI-NEXT: ushll v16.8h, v16.8b, #0 ; CHECK-GI-NEXT: ushll v17.8h, v17.8b, #0 -; CHECK-GI-NEXT: uabdl v22.4s, v5.4h, v7.4h -; CHECK-GI-NEXT: uabdl2 v5.4s, v5.8h, v7.8h +; CHECK-GI-NEXT: add v2.4s, v16.4s, v2.4s +; CHECK-GI-NEXT: add v3.4s, v3.4s, v4.4s +; CHECK-GI-NEXT: uabdl v4.4s, v1.4h, v7.4h +; CHECK-GI-NEXT: uabdl2 v1.4s, v1.8h, v7.8h +; CHECK-GI-NEXT: ldr d7, [x10] +; CHECK-GI-NEXT: ldr d16, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 ; CHECK-GI-NEXT: ldr d18, [x10] +; CHECK-GI-NEXT: ldr d20, [x10, x9] ; CHECK-GI-NEXT: ldr d19, [x11] -; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: ldr d21, [x11, x8] +; CHECK-GI-NEXT: uabdl v6.4s, v5.4h, v17.4h +; CHECK-GI-NEXT: ushll v7.8h, v7.8b, #0 +; CHECK-GI-NEXT: ushll v16.8h, v16.8b, #0 +; CHECK-GI-NEXT: uabdl2 v5.4s, v5.8h, v17.8h +; CHECK-GI-NEXT: ushll v17.8h, v18.8b, #0 +; CHECK-GI-NEXT: ushll v18.8h, v19.8b, #0 +; CHECK-GI-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-GI-NEXT: ushll v4.8h, v20.8b, #0 +; CHECK-GI-NEXT: ushll v19.8h, v21.8b, #0 ; CHECK-GI-NEXT: addv s2, v2.4s -; CHECK-GI-NEXT: addv s1, v1.4s -; CHECK-GI-NEXT: ushll v18.8h, v18.8b, #0 -; CHECK-GI-NEXT: ushll v19.8h, v19.8b, #0 -; CHECK-GI-NEXT: uabdl v4.4s, v16.4h, v17.4h -; CHECK-GI-NEXT: uabdl2 v16.4s, v16.8h, v17.8h -; CHECK-GI-NEXT: add v5.4s, v22.4s, v5.4s -; CHECK-GI-NEXT: ldr d20, [x10, x8] -; CHECK-GI-NEXT: ldr d21, [x11, x9] +; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: addv s3, v3.4s +; CHECK-GI-NEXT: uabdl v20.4s, v7.4h, v16.4h +; CHECK-GI-NEXT: uabdl2 v7.4s, v7.8h, v16.8h +; CHECK-GI-NEXT: add v5.4s, v6.4s, v5.4s +; CHECK-GI-NEXT: uabdl v6.4s, v17.4h, v18.4h +; CHECK-GI-NEXT: uabdl2 v16.4s, v17.8h, v18.8h +; CHECK-GI-NEXT: uabdl v17.4s, v4.4h, v19.4h +; CHECK-GI-NEXT: uabdl2 v4.4s, v4.8h, v19.8h ; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: addv s1, v1.4s ; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: ushll v7.8h, v20.8b, #0 -; CHECK-GI-NEXT: ushll v20.8h, v21.8b, #0 -; CHECK-GI-NEXT: uabdl v6.4s, v18.4h, v19.4h -; CHECK-GI-NEXT: uabdl2 v17.4s, v18.8h, v19.8h -; CHECK-GI-NEXT: add v4.4s, v4.4s, v16.4s -; CHECK-GI-NEXT: addv s5, v5.4s -; CHECK-GI-NEXT: fmov w10, s1 +; CHECK-GI-NEXT: fmov w10, s3 +; CHECK-GI-NEXT: add v7.4s, v20.4s, v7.4s +; CHECK-GI-NEXT: add v0.4s, v17.4s, v4.4s +; CHECK-GI-NEXT: addv s4, v5.4s +; CHECK-GI-NEXT: add v2.4s, v6.4s, v16.4s ; CHECK-GI-NEXT: add w8, w8, w9 -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: uabdl v18.4s, v7.4h, v20.4h -; CHECK-GI-NEXT: uabdl2 v7.4s, v7.8h, v20.8h -; CHECK-GI-NEXT: add v6.4s, v6.4s, v17.4s +; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: add w8, w10, w8 -; CHECK-GI-NEXT: addv s0, v4.4s +; CHECK-GI-NEXT: addv s3, v7.4s +; CHECK-GI-NEXT: addv s1, v2.4s +; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: add w8, w9, w8 -; CHECK-GI-NEXT: fmov w9, s5 -; CHECK-GI-NEXT: add v7.4s, v18.4s, v7.4s -; CHECK-GI-NEXT: addv s1, v6.4s +; CHECK-GI-NEXT: fmov w9, s4 ; CHECK-GI-NEXT: add w8, w9, w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: addv s2, v7.4s +; CHECK-GI-NEXT: fmov w9, s3 ; CHECK-GI-NEXT: add w8, w9, w8 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: add w8, w9, w8 -; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: fmov w9, s0 ; CHECK-GI-NEXT: add w0, w9, w8 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll index de26676b5c73e..063b23275c616 100644 --- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll @@ -264,15 +264,15 @@ define <4 x bfloat> @test_copysign_v4bf16_v4bf16(<4 x bfloat> %a, <4 x bfloat> % define <4 x bfloat> @test_copysign_v4bf16_v4f32(<4 x bfloat> %a, <4 x float> %b) #0 { ; CHECK-LABEL: test_copysign_v4bf16_v4f32: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.4s v2, #127, msl #8 -; CHECK-NEXT: movi.4s v3, #1 +; CHECK-NEXT: movi.4s v2, #1 +; CHECK-NEXT: movi.4s v3, #127, msl #8 ; CHECK-NEXT: ushr.4s v4, v1, #16 -; CHECK-NEXT: add.4s v2, v1, v2 -; CHECK-NEXT: and.16b v3, v4, v3 -; CHECK-NEXT: add.4s v2, v3, v2 -; CHECK-NEXT: fcmeq.4s v3, v1, v1 +; CHECK-NEXT: and.16b v2, v4, v2 +; CHECK-NEXT: add.4s v3, v1, v3 +; CHECK-NEXT: fcmeq.4s v4, v1, v1 ; CHECK-NEXT: orr.4s v1, #64, lsl #16 -; CHECK-NEXT: bit.16b v1, v2, v3 +; CHECK-NEXT: add.4s v2, v2, v3 +; CHECK-NEXT: bit.16b v1, v2, v4 ; CHECK-NEXT: mvni.4h v2, #128, lsl #8 ; CHECK-NEXT: shrn.4h v1, v1, #16 ; CHECK-NEXT: bif.8b v0, v1, v2 @@ -286,16 +286,16 @@ define <4 x bfloat> @test_copysign_v4bf16_v4f64(<4 x bfloat> %a, <4 x double> %b ; CHECK-LABEL: test_copysign_v4bf16_v4f64: ; CHECK: ; %bb.0: ; CHECK-NEXT: fcvtxn v1.2s, v1.2d -; CHECK-NEXT: movi.4s v3, #1 +; CHECK-NEXT: movi.4s v3, #127, msl #8 ; CHECK-NEXT: fcvtxn2 v1.4s, v2.2d -; CHECK-NEXT: movi.4s v2, #127, msl #8 +; CHECK-NEXT: movi.4s v2, #1 ; CHECK-NEXT: ushr.4s v4, v1, #16 -; CHECK-NEXT: add.4s v2, v1, v2 -; CHECK-NEXT: and.16b v3, v4, v3 -; CHECK-NEXT: add.4s v2, v3, v2 -; CHECK-NEXT: fcmeq.4s v3, v1, v1 +; CHECK-NEXT: add.4s v3, v1, v3 +; CHECK-NEXT: and.16b v2, v4, v2 +; CHECK-NEXT: fcmeq.4s v4, v1, v1 ; CHECK-NEXT: orr.4s v1, #64, lsl #16 -; CHECK-NEXT: bit.16b v1, v2, v3 +; CHECK-NEXT: add.4s v2, v2, v3 +; CHECK-NEXT: bit.16b v1, v2, v4 ; CHECK-NEXT: mvni.4h v2, #128, lsl #8 ; CHECK-NEXT: shrn.4h v1, v1, #16 ; CHECK-NEXT: bif.8b v0, v1, v2 @@ -322,22 +322,22 @@ define <8 x bfloat> @test_copysign_v8bf16_v8bf16(<8 x bfloat> %a, <8 x bfloat> % define <8 x bfloat> @test_copysign_v8bf16_v8f32(<8 x bfloat> %a, <8 x float> %b) #0 { ; CHECK-LABEL: test_copysign_v8bf16_v8f32: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.4s v3, #127, msl #8 -; CHECK-NEXT: movi.4s v4, #1 +; CHECK-NEXT: movi.4s v3, #1 +; CHECK-NEXT: movi.4s v4, #127, msl #8 ; CHECK-NEXT: ushr.4s v5, v2, #16 ; CHECK-NEXT: ushr.4s v6, v1, #16 -; CHECK-NEXT: add.4s v7, v2, v3 -; CHECK-NEXT: add.4s v3, v1, v3 -; CHECK-NEXT: and.16b v5, v5, v4 -; CHECK-NEXT: and.16b v4, v6, v4 +; CHECK-NEXT: and.16b v5, v5, v3 +; CHECK-NEXT: add.4s v7, v2, v4 +; CHECK-NEXT: and.16b v3, v6, v3 +; CHECK-NEXT: add.4s v4, v1, v4 ; CHECK-NEXT: fcmeq.4s v6, v2, v2 ; CHECK-NEXT: orr.4s v2, #64, lsl #16 ; CHECK-NEXT: add.4s v5, v5, v7 -; CHECK-NEXT: add.4s v3, v4, v3 -; CHECK-NEXT: fcmeq.4s v4, v1, v1 +; CHECK-NEXT: fcmeq.4s v7, v1, v1 ; CHECK-NEXT: orr.4s v1, #64, lsl #16 +; CHECK-NEXT: add.4s v3, v3, v4 ; CHECK-NEXT: bit.16b v2, v5, v6 -; CHECK-NEXT: bit.16b v1, v3, v4 +; CHECK-NEXT: bit.16b v1, v3, v7 ; CHECK-NEXT: uzp2.8h v1, v1, v2 ; CHECK-NEXT: mvni.8h v2, #128, lsl #8 ; CHECK-NEXT: bif.16b v0, v1, v2 diff --git a/llvm/test/CodeGen/AArch64/vector-gep.ll b/llvm/test/CodeGen/AArch64/vector-gep.ll index 30317dce85e65..c7858416e1796 100644 --- a/llvm/test/CodeGen/AArch64/vector-gep.ll +++ b/llvm/test/CodeGen/AArch64/vector-gep.ll @@ -13,11 +13,11 @@ define <2 x ptr> @vector_gep(<2 x ptr> %0) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: Lloh0: ; CHECK-NEXT: adrp x8, lCPI0_0@PAGE +; CHECK-NEXT: movi v2.2d, #0x000000ffffffff ; CHECK-NEXT: Lloh1: ; CHECK-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF] ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d -; CHECK-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1 entry: diff --git a/llvm/test/CodeGen/AArch64/vselect-constants.ll b/llvm/test/CodeGen/AArch64/vselect-constants.ll index a32147eebd759..5e6ff1e0740ce 100644 --- a/llvm/test/CodeGen/AArch64/vselect-constants.ll +++ b/llvm/test/CodeGen/AArch64/vselect-constants.ll @@ -370,10 +370,9 @@ define @signbit_mask_xor_nxv16i8( %a, %a, zeroinitializer %xor = xor %a, %b diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 08ad34c7b03ba..599bd811d7d59 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -1697,14 +1697,14 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-NEXT: ldp q18, q16, [x10, #96] ; CHECK-NEXT: uaddw.2d v2, v17, v2 ; CHECK-NEXT: stp q4, q5, [x10, #32] -; CHECK-NEXT: ldp q17, q5, [x10, #64] -; CHECK-NEXT: uaddw2.2d v16, v16, v7 +; CHECK-NEXT: uaddw2.2d v5, v16, v7 +; CHECK-NEXT: ldp q16, q4, [x10, #64] ; CHECK-NEXT: uaddw.2d v7, v18, v7 ; CHECK-NEXT: stp q2, q6, [x10] -; CHECK-NEXT: uaddw2.2d v4, v5, v3 -; CHECK-NEXT: uaddw.2d v3, v17, v3 -; CHECK-NEXT: stp q7, q16, [x10, #96] -; CHECK-NEXT: stp q3, q4, [x10, #64] +; CHECK-NEXT: uaddw2.2d v4, v4, v3 +; CHECK-NEXT: uaddw.2d v2, v16, v3 +; CHECK-NEXT: stp q7, q5, [x10, #96] +; CHECK-NEXT: stp q2, q4, [x10, #64] ; CHECK-NEXT: b.ne LBB17_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret @@ -1729,15 +1729,15 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE-NEXT: ld1 { v3.8b }, [x10] ; CHECK-BE-NEXT: add x10, x1, x8 ; CHECK-BE-NEXT: add x8, x8, #128 -; CHECK-BE-NEXT: add x15, x10, #96 ; CHECK-BE-NEXT: add x11, x10, #32 ; CHECK-BE-NEXT: add x14, x10, #64 +; CHECK-BE-NEXT: add x15, x10, #96 ; CHECK-BE-NEXT: tbl v4.16b, { v2.16b }, v1.16b ; CHECK-BE-NEXT: tbl v2.16b, { v2.16b }, v0.16b -; CHECK-BE-NEXT: ld1 { v16.2d }, [x15] -; CHECK-BE-NEXT: tbl v5.16b, { v3.16b }, v1.16b +; CHECK-BE-NEXT: ld1 { v5.2d }, [x10] +; CHECK-BE-NEXT: tbl v6.16b, { v3.16b }, v1.16b ; CHECK-BE-NEXT: tbl v3.16b, { v3.16b }, v0.16b -; CHECK-BE-NEXT: ld1 { v6.2d }, [x10] +; CHECK-BE-NEXT: ld1 { v16.2d }, [x15] ; CHECK-BE-NEXT: ld1 { v19.2d }, [x14] ; CHECK-BE-NEXT: ld1 { v21.2d }, [x11] ; CHECK-BE-NEXT: add x12, x10, #48 @@ -1747,11 +1747,12 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE-NEXT: rev32 v7.8b, v4.8b ; CHECK-BE-NEXT: ext v4.16b, v4.16b, v4.16b, #8 ; CHECK-BE-NEXT: rev32 v17.8b, v2.8b -; CHECK-BE-NEXT: ext v18.16b, v5.16b, v5.16b, #8 +; CHECK-BE-NEXT: ext v18.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: ext v20.16b, v3.16b, v3.16b, #8 ; CHECK-BE-NEXT: ext v2.16b, v2.16b, v2.16b, #8 -; CHECK-BE-NEXT: rev32 v5.8b, v5.8b +; CHECK-BE-NEXT: rev32 v6.8b, v6.8b ; CHECK-BE-NEXT: rev32 v3.8b, v3.8b +; CHECK-BE-NEXT: ld1 { v22.2d }, [x12] ; CHECK-BE-NEXT: cmp x8, #1024 ; CHECK-BE-NEXT: rev32 v4.8b, v4.8b ; CHECK-BE-NEXT: uaddw v7.2d, v16.2d, v7.2s @@ -1760,22 +1761,21 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE-NEXT: rev32 v20.8b, v20.8b ; CHECK-BE-NEXT: rev32 v2.8b, v2.8b ; CHECK-BE-NEXT: uaddw v17.2d, v19.2d, v17.2s -; CHECK-BE-NEXT: ld1 { v19.2d }, [x12] -; CHECK-BE-NEXT: uaddw v5.2d, v21.2d, v5.2s -; CHECK-BE-NEXT: ld1 { v21.2d }, [x13] -; CHECK-BE-NEXT: uaddw v3.2d, v6.2d, v3.2s -; CHECK-BE-NEXT: ld1 { v6.2d }, [x17] -; CHECK-BE-NEXT: uaddw v4.2d, v16.2d, v4.2s +; CHECK-BE-NEXT: ld1 { v19.2d }, [x13] +; CHECK-BE-NEXT: uaddw v6.2d, v21.2d, v6.2s +; CHECK-BE-NEXT: uaddw v3.2d, v5.2d, v3.2s +; CHECK-BE-NEXT: ld1 { v5.2d }, [x17] ; CHECK-BE-NEXT: st1 { v7.2d }, [x15] -; CHECK-BE-NEXT: uaddw v7.2d, v19.2d, v18.2s -; CHECK-BE-NEXT: uaddw v16.2d, v21.2d, v20.2s -; CHECK-BE-NEXT: uaddw v2.2d, v6.2d, v2.2s -; CHECK-BE-NEXT: st1 { v17.2d }, [x14] -; CHECK-BE-NEXT: st1 { v5.2d }, [x11] +; CHECK-BE-NEXT: uaddw v4.2d, v16.2d, v4.2s +; CHECK-BE-NEXT: st1 { v6.2d }, [x11] +; CHECK-BE-NEXT: uaddw v6.2d, v22.2d, v18.2s ; CHECK-BE-NEXT: st1 { v3.2d }, [x10] +; CHECK-BE-NEXT: uaddw v3.2d, v19.2d, v20.2s +; CHECK-BE-NEXT: uaddw v2.2d, v5.2d, v2.2s +; CHECK-BE-NEXT: st1 { v17.2d }, [x14] ; CHECK-BE-NEXT: st1 { v4.2d }, [x16] -; CHECK-BE-NEXT: st1 { v7.2d }, [x12] -; CHECK-BE-NEXT: st1 { v16.2d }, [x13] +; CHECK-BE-NEXT: st1 { v6.2d }, [x12] +; CHECK-BE-NEXT: st1 { v3.2d }, [x13] ; CHECK-BE-NEXT: st1 { v2.2d }, [x17] ; CHECK-BE-NEXT: b.ne .LBB17_1 ; CHECK-BE-NEXT: // %bb.2: // %exit diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-neon-instructions.s index 73eadc268bf26..f703392f3e9d0 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-neon-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-neon-instructions.s @@ -1070,14 +1070,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 1 4 0.50 abs d29, d24 -# CHECK-NEXT: 1 4 0.50 abs v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 abs v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 abs v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 abs v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 abs v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 abs v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 abs v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 abs d29, d24 +# CHECK-NEXT: 1 3 0.50 abs v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 abs v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 abs v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 abs v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 abs v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 abs v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 abs v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 add d17, d31, d29 # CHECK-NEXT: 1 3 0.50 add v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 addhn v0.2s, v0.2d, v0.2d @@ -1086,8 +1086,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 addhn2 v0.16b, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 addhn2 v0.4s, v0.2d, v0.2d # CHECK-NEXT: 1 4 0.50 addhn2 v0.8h, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 addp v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 addp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 addp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 addp v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 3 0.50 and v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 3 0.50 bic v0.4h, #15, lsl #8 # CHECK-NEXT: 1 3 0.50 bic v0.8b, v0.8b, v0.8b @@ -1441,13 +1441,13 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 mvni v0.2s, #0 # CHECK-NEXT: 1 3 0.50 mvni v0.4s, #16, msl #16 # CHECK-NEXT: 1 3 0.50 neg d29, d24 -# CHECK-NEXT: 1 4 0.50 neg v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 neg v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 neg v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 neg v0.2d, v0.2d # CHECK-NEXT: 1 3 0.50 neg v0.2s, v0.2s # CHECK-NEXT: 1 3 0.50 neg v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 neg v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 neg v0.4s, v0.4s # CHECK-NEXT: 1 3 0.50 neg v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 neg v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 neg v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 mvn v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 mvn v0.8b, v0.8b # CHECK-NEXT: 1 3 0.50 orn v0.16b, v0.16b, v0.16b @@ -1457,12 +1457,12 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 pmul v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 pmull v0.8h, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 pmull2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 raddhn v0.2s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 raddhn v0.4h, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 raddhn v0.8b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 raddhn2 v0.16b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 raddhn2 v0.4s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 raddhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 8 0.50 raddhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 1 8 0.50 raddhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 1 8 0.50 raddhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 1 8 0.50 raddhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 1 8 0.50 raddhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 1 8 0.50 raddhn2 v0.8h, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 rbit v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 rbit v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 rev16 v21.8b, v1.8b @@ -1483,19 +1483,19 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 rshrn2 v0.16b, v0.8h, #3 # CHECK-NEXT: 1 4 0.50 rshrn2 v0.4s, v0.2d, #3 # CHECK-NEXT: 1 4 0.50 rshrn2 v0.8h, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 rsubhn v0.2s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 rsubhn v0.4h, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 rsubhn v0.8b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 rsubhn2 v0.16b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 rsubhn2 v0.4s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 rsubhn2 v0.8h, v0.4s, v0.4s -# CHECK-NEXT: 1 8 0.50 saba v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 8 0.50 sabal v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 8 0.50 sabal v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 8 0.50 sabal v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 8 0.50 sabal2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 8 0.50 sabal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 8 0.50 sabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 8 0.50 rsubhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 1 8 0.50 rsubhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 1 8 0.50 rsubhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 1 8 0.50 rsubhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 1 8 0.50 rsubhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 1 8 0.50 rsubhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 6 0.50 saba v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 6 0.50 sabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 6 0.50 sabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 6 0.50 sabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 6 0.50 sabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 6 0.50 sabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 6 0.50 sabal2 v0.8h, v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 sabd v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 3 0.50 sabdl v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 3 0.50 sabdl v0.4s, v0.4h, v0.4h @@ -1503,30 +1503,30 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 sabdl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 3 0.50 sabdl2 v0.4s, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 sabdl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 8 1.00 sadalp v0.1d, v0.2s -# CHECK-NEXT: 1 8 1.00 sadalp v0.2d, v0.4s -# CHECK-NEXT: 1 8 1.00 sadalp v0.2s, v0.4h -# CHECK-NEXT: 1 8 1.00 sadalp v0.4h, v0.8b -# CHECK-NEXT: 1 8 1.00 sadalp v0.4s, v0.8h -# CHECK-NEXT: 1 8 1.00 sadalp v0.8h, v0.16b -# CHECK-NEXT: 1 4 0.50 saddl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 saddl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 saddl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 saddl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 saddl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 saddl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 saddlp v0.1d, v0.2s -# CHECK-NEXT: 1 4 0.50 saddlp v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 saddlp v0.2s, v0.4h -# CHECK-NEXT: 1 4 0.50 saddlp v0.4h, v0.8b -# CHECK-NEXT: 1 4 0.50 saddlp v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 saddlp v0.8h, v0.16b -# CHECK-NEXT: 1 4 0.50 saddw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 0.50 saddw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 0.50 saddw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 0.50 saddw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 saddw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 saddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 7 1.00 sadalp v0.1d, v0.2s +# CHECK-NEXT: 1 7 1.00 sadalp v0.2d, v0.4s +# CHECK-NEXT: 1 7 1.00 sadalp v0.2s, v0.4h +# CHECK-NEXT: 1 7 1.00 sadalp v0.4h, v0.8b +# CHECK-NEXT: 1 7 1.00 sadalp v0.4s, v0.8h +# CHECK-NEXT: 1 7 1.00 sadalp v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 saddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 saddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 saddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 saddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 saddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 saddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 saddlp v0.1d, v0.2s +# CHECK-NEXT: 1 3 0.50 saddlp v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 saddlp v0.2s, v0.4h +# CHECK-NEXT: 1 3 0.50 saddlp v0.4h, v0.8b +# CHECK-NEXT: 1 3 0.50 saddlp v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 saddlp v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 saddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 saddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 saddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 saddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 saddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 saddw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 0.50 scvtf d21, d12 # CHECK-NEXT: 1 4 0.50 scvtf d21, d12, #64 # CHECK-NEXT: 1 4 0.50 scvtf s22, s13 @@ -1573,18 +1573,18 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 sli v0.4s, v0.4s, #3 # CHECK-NEXT: 1 4 0.50 sli v0.8b, v0.8b, #3 # CHECK-NEXT: 1 4 0.50 sli v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 smax v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 smax v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 smax v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 smaxp v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 smaxp v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 smaxp v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 smin v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 smax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 smax v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 smax v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 smaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 smaxp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 smaxp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 smin v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 smin v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 smin v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 sminp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 smin v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sminp v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 sminp v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 sminp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sminp v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 smlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 smlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 smlal v0.8h, v0.8b, v0.8b @@ -1777,14 +1777,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 srshr v0.4s, v0.4s, #3 # CHECK-NEXT: 1 3 0.50 srshr v0.8b, v0.8b, #3 # CHECK-NEXT: 1 3 0.50 srshr v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 8 1.00 srsra d15, d11, #19 -# CHECK-NEXT: 1 8 1.00 srsra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 7 1.00 srsra d15, d11, #19 +# CHECK-NEXT: 1 7 1.00 srsra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.8h, v0.8h, #3 # CHECK-NEXT: 1 3 0.50 sshl d31, d31, d31 # CHECK-NEXT: 1 3 0.50 sshl v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1 3 0.50 sshl v0.2s, v0.2s, v0.2s @@ -1800,26 +1800,26 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 sshr v0.4s, v0.4s, #3 # CHECK-NEXT: 1 3 0.50 sshr v0.8b, v0.8b, #3 # CHECK-NEXT: 1 3 0.50 sshr v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 8 1.00 ssra d18, d12, #21 -# CHECK-NEXT: 1 8 1.00 ssra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 ssubl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 ssubl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 ssubl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 ssubl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 ssubl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 ssubl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 ssubw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 0.50 ssubw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 0.50 ssubw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 0.50 ssubw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 ssubw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 ssubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 ssra d18, d12, #21 +# CHECK-NEXT: 1 3 0.50 ssra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 ssubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 ssubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 ssubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 ssubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 ssubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 ssubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 ssubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 ssubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 ssubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 ssubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 ssubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 ssubw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 1.00 * st1 { v0.16b }, [x0] # CHECK-NEXT: 2 5 2.00 * st1 { v0.2d, v1.2d, v2.2d }, [x0], #48 # CHECK-NEXT: 1 5 4.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] @@ -1843,7 +1843,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 5 2.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0] # CHECK-NEXT: 2 5 2.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0], x5 # CHECK-NEXT: 1 3 0.50 sub d15, d5, d16 -# CHECK-NEXT: 1 4 0.50 sub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 sub v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1 4 0.50 suqadd b19, b14 # CHECK-NEXT: 1 4 0.50 suqadd d18, d22 # CHECK-NEXT: 1 4 0.50 suqadd h20, h15 @@ -1885,13 +1885,13 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 trn2 v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 trn2 v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 trn2 v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 8 0.50 uaba v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 8 0.50 uabal v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 8 0.50 uabal v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 8 0.50 uabal v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 8 0.50 uabal2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 8 0.50 uabal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 8 0.50 uabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 6 0.50 uaba v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 6 0.50 uabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 6 0.50 uabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 6 0.50 uabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 6 0.50 uabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 6 0.50 uabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 6 0.50 uabal2 v0.8h, v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 uabd v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 3 0.50 uabdl v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 3 0.50 uabdl v0.4s, v0.4h, v0.4h @@ -1899,30 +1899,30 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 uabdl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 3 0.50 uabdl2 v0.4s, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 uabdl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 8 1.00 uadalp v0.1d, v0.2s -# CHECK-NEXT: 1 8 1.00 uadalp v0.2d, v0.4s -# CHECK-NEXT: 1 8 1.00 uadalp v0.2s, v0.4h -# CHECK-NEXT: 1 8 1.00 uadalp v0.4h, v0.8b -# CHECK-NEXT: 1 8 1.00 uadalp v0.4s, v0.8h -# CHECK-NEXT: 1 8 1.00 uadalp v0.8h, v0.16b -# CHECK-NEXT: 1 4 0.50 uaddl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 uaddl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 uaddl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 uaddl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 uaddl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 uaddl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 uaddlp v0.1d, v0.2s -# CHECK-NEXT: 1 4 0.50 uaddlp v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 uaddlp v0.2s, v0.4h -# CHECK-NEXT: 1 4 0.50 uaddlp v0.4h, v0.8b -# CHECK-NEXT: 1 4 0.50 uaddlp v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 uaddlp v0.8h, v0.16b -# CHECK-NEXT: 1 4 0.50 uaddw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 0.50 uaddw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 0.50 uaddw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 0.50 uaddw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 uaddw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 uaddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 7 1.00 uadalp v0.1d, v0.2s +# CHECK-NEXT: 1 7 1.00 uadalp v0.2d, v0.4s +# CHECK-NEXT: 1 7 1.00 uadalp v0.2s, v0.4h +# CHECK-NEXT: 1 7 1.00 uadalp v0.4h, v0.8b +# CHECK-NEXT: 1 7 1.00 uadalp v0.4s, v0.8h +# CHECK-NEXT: 1 7 1.00 uadalp v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 uaddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 uaddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 uaddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 uaddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 uaddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uaddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uaddlp v0.1d, v0.2s +# CHECK-NEXT: 1 3 0.50 uaddlp v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 uaddlp v0.2s, v0.4h +# CHECK-NEXT: 1 3 0.50 uaddlp v0.4h, v0.8b +# CHECK-NEXT: 1 3 0.50 uaddlp v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 uaddlp v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 uaddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 uaddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 uaddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 uaddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 uaddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 uaddw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 0.50 ucvtf d21, d14 # CHECK-NEXT: 1 4 0.50 ucvtf d21, d14, #64 # CHECK-NEXT: 1 4 0.50 ucvtf s22, s13 @@ -1935,21 +1935,21 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 ucvtf v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 ucvtf v0.4s, v0.4s, #3 # CHECK-NEXT: 1 4 0.50 ucvtf v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 uhadd v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 uhadd v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 uhsub v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 umax v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uhsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 umax v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 umax v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 umax v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 umaxp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 umax v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 umaxp v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 umaxp v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 umaxp v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 umin v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 umin v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 umin v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 uminp v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 uminp v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 uminp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 umaxp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 umin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 umin v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 umin v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 uminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 uminp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 uminp v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 umlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 umlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 umlal v0.8h, v0.8b, v0.8b @@ -2024,9 +2024,9 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 uqxtn2 v0.8h, v0.4s # CHECK-NEXT: 1 4 0.50 urecpe v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 urecpe v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 urhadd v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 urhadd v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 urhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 urhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 urhadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 urhadd v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 urshl d8, d7, d4 # CHECK-NEXT: 1 3 0.50 urshl v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 urshl v0.2d, v0.2d, v0.2d @@ -2042,14 +2042,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 urshr v0.8h, v0.8h, #3 # CHECK-NEXT: 1 12 9.00 ursqrte v0.2s, v0.2s # CHECK-NEXT: 1 12 9.00 ursqrte v0.4s, v0.4s -# CHECK-NEXT: 1 8 1.00 ursra d18, d10, #13 -# CHECK-NEXT: 1 8 1.00 ursra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 7 1.00 ursra d18, d10, #13 +# CHECK-NEXT: 1 7 1.00 ursra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.8h, v0.8h, #3 # CHECK-NEXT: 1 3 0.50 ushl d0, d0, d0 # CHECK-NEXT: 1 3 0.50 ushl v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 ushl v0.4s, v0.4s, v0.4s @@ -2075,26 +2075,26 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 usqadd v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 usqadd v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 usqadd v0.8h, v0.8h -# CHECK-NEXT: 1 8 1.00 usra d20, d13, #61 -# CHECK-NEXT: 1 8 1.00 usra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 usubl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 usubl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 usubl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 usubl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 usubl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 usubl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 usubw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 0.50 usubw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 0.50 usubw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 0.50 usubw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 usubw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 usubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 usra d20, d13, #61 +# CHECK-NEXT: 1 3 0.50 usra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 usubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 usubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 usubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 usubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 usubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 usubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 usubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 usubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 usubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 usubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 usubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 usubw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 0.50 uzp1 v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 uzp1 v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1 4 0.50 uzp1 v0.2s, v0.2s, v0.2s @@ -2148,7 +2148,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] [6] [7] [8] [9] [10.0] [10.1] [11] -# CHECK-NEXT: - - - - - 39.00 91.00 - - 509.00 509.00 3.00 3.00 197.00 +# CHECK-NEXT: - - - - - 39.00 91.00 - - 501.00 501.00 3.00 3.00 197.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] [6] [7] [8] [9] [10.0] [10.1] [11] Instructions: @@ -2882,14 +2882,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - sshr v0.4s, v0.4s, #3 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - sshr v0.8b, v0.8b, #3 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - sshr v0.8h, v0.8h, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra d18, d12, #21 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.16b, v0.16b, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.2d, v0.2d, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.2s, v0.2s, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.4h, v0.4h, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.4s, v0.4s, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.8b, v0.8b, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra d18, d12, #21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.8h, v0.8h, #3 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssubl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssubl v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssubl v0.8h, v0.8b, v0.8b @@ -3157,14 +3157,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usqadd v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usqadd v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usqadd v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra d20, d13, #61 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.16b, v0.16b, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.2d, v0.2d, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.2s, v0.2s, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.4h, v0.4h, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.4s, v0.4s, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.8b, v0.8b, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra d20, d13, #61 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.8h, v0.8h, #3 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usubl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usubl v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usubl v0.8h, v0.8b, v0.8b diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-sve-instructions.s index a8fb8b669838f..d8051e7ecb4fe 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-sve-instructions.s @@ -3476,12 +3476,12 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 add z31.s, p7/m, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 add z31.s, z31.s, #65280 # CHECK-NEXT: 1 3 0.50 add z31.s, z31.s, z31.s -# CHECK-NEXT: 1 4 0.50 addhnb z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 addhnb z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 addhnb z0.s, z1.d, z31.d -# CHECK-NEXT: 1 4 0.50 addhnt z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 addhnt z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 addhnt z0.s, z1.d, z31.d +# CHECK-NEXT: 1 8 0.50 addhnb z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 addhnb z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 addhnb z0.s, z1.d, z31.d +# CHECK-NEXT: 1 8 0.50 addhnt z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 addhnt z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 addhnt z0.s, z1.d, z31.d # CHECK-NEXT: 1 3 0.50 addp z0.b, p0/m, z0.b, z1.b # CHECK-NEXT: 1 3 0.50 addp z0.h, p0/m, z0.h, z1.h # CHECK-NEXT: 1 3 0.50 addp z29.s, p7/m, z29.s, z30.s @@ -3516,7 +3516,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 aesimc z31.b, z31.b # CHECK-NEXT: 1 3 0.50 aesmc z0.b, z0.b # CHECK-NEXT: 1 3 0.50 aesmc z31.b, z31.b -# CHECK-NEXT: 1 6 1.00 and p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 and p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 3 0.50 and z0.d, z0.d, #0x6 # CHECK-NEXT: 1 3 0.50 and z0.d, z0.d, #0xfffffffffffffff9 # CHECK-NEXT: 1 3 0.50 and z0.d, z0.d, z0.d @@ -3531,7 +3531,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 and z31.s, p7/m, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 and z5.b, z5.b, #0x6 # CHECK-NEXT: 1 3 0.50 and z5.b, z5.b, #0xf9 -# CHECK-NEXT: 1 6 1.00 ands p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 ands p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 4 1.00 andv b0, p7, z31.b # CHECK-NEXT: 1 4 1.00 andv d0, p7, z31.d # CHECK-NEXT: 1 4 1.00 andv h0, p7, z31.h @@ -3574,7 +3574,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 asrr z0.d, p0/m, z0.d, z0.d # CHECK-NEXT: 1 3 0.50 asrr z0.h, p0/m, z0.h, z0.h # CHECK-NEXT: 1 3 0.50 asrr z0.s, p0/m, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 bcax z29.d, z29.d, z30.d, z31.d +# CHECK-NEXT: 1 4 0.50 bcax z29.d, z29.d, z30.d, z31.d # CHECK-NEXT: 1 14 13.00 bdep z0.b, z1.b, z31.b # CHECK-NEXT: 1 70 69.00 bdep z0.d, z1.d, z31.d # CHECK-NEXT: 1 22 21.00 bdep z0.h, z1.h, z31.h @@ -3603,34 +3603,34 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 70 69.00 bgrp z0.d, z1.d, z31.d # CHECK-NEXT: 1 22 21.00 bgrp z0.h, z1.h, z31.h # CHECK-NEXT: 1 38 37.00 bgrp z0.s, z1.s, z31.s -# CHECK-NEXT: 1 6 1.00 bic p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 bic p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 bic p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 bic p15.b, p15/z, p15.b, p15.b # CHECK-NEXT: 1 3 0.50 bic z0.d, z0.d, z0.d # CHECK-NEXT: 1 3 0.50 bic z23.d, z13.d, z8.d # CHECK-NEXT: 1 3 0.50 bic z31.b, p7/m, z31.b, z31.b # CHECK-NEXT: 1 3 0.50 bic z31.d, p7/m, z31.d, z31.d # CHECK-NEXT: 1 3 0.50 bic z31.h, p7/m, z31.h, z31.h # CHECK-NEXT: 1 3 0.50 bic z31.s, p7/m, z31.s, z31.s -# CHECK-NEXT: 1 6 1.00 bics p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 bics p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brka p0.b, p15/m, p15.b -# CHECK-NEXT: 1 6 1.00 brka p0.b, p15/z, p15.b -# CHECK-NEXT: 1 6 1.00 brkas p0.b, p15/z, p15.b -# CHECK-NEXT: 1 6 1.00 brkb p0.b, p15/m, p15.b -# CHECK-NEXT: 1 6 1.00 brkb p0.b, p15/z, p15.b -# CHECK-NEXT: 1 6 1.00 brkbs p0.b, p15/z, p15.b -# CHECK-NEXT: 1 6 1.00 brkn p0.b, p15/z, p1.b, p0.b -# CHECK-NEXT: 1 6 1.00 brkn p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkns p0.b, p15/z, p1.b, p0.b -# CHECK-NEXT: 1 6 1.00 brkns p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkpa p0.b, p15/z, p1.b, p2.b -# CHECK-NEXT: 1 6 1.00 brkpa p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkpas p0.b, p15/z, p1.b, p2.b -# CHECK-NEXT: 1 6 1.00 brkpas p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkpb p0.b, p15/z, p1.b, p2.b -# CHECK-NEXT: 1 6 1.00 brkpb p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkpbs p0.b, p15/z, p1.b, p2.b -# CHECK-NEXT: 1 6 1.00 brkpbs p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 bics p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 bics p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brka p0.b, p15/m, p15.b +# CHECK-NEXT: 1 2 1.00 brka p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkas p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkb p0.b, p15/m, p15.b +# CHECK-NEXT: 1 2 1.00 brkb p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkbs p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkn p0.b, p15/z, p1.b, p0.b +# CHECK-NEXT: 1 2 1.00 brkn p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brkns p0.b, p15/z, p1.b, p0.b +# CHECK-NEXT: 1 2 1.00 brkns p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brkpa p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 2 1.00 brkpa p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 4 1.00 brkpas p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 4 1.00 brkpas p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brkpb p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 2 1.00 brkpb p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 4 1.00 brkpbs p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 4 1.00 brkpbs p15.b, p15/z, p15.b, p15.b # CHECK-NEXT: 1 3 0.50 bsl z0.d, z0.d, z1.d, z2.d # CHECK-NEXT: 1 3 0.50 bsl1n z0.d, z0.d, z1.d, z2.d # CHECK-NEXT: 1 3 0.50 bsl2n z0.d, z0.d, z1.d, z2.d @@ -3704,163 +3704,163 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 cmla z31.h, z31.h, z31.h, #180 # CHECK-NEXT: 1 4 0.50 cmla z31.s, z30.s, z7.s[0], #180 # CHECK-NEXT: 1 4 0.50 cmla z31.s, z31.s, z31.s, #180 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpeq p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpeq p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpeq p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpeq p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpeq p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpeq p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z1.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpge p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmpge p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmpge p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.d, p0/z, z1.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z1.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z1.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z1.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpgt p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.d, p0/z, z1.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z1.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z1.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z0.b, #0 -# CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z0.b, #127 -# CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z1.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmphi p0.d, p0/z, z0.d, #0 -# CHECK-NEXT: 1 3 0.50 cmphi p0.d, p0/z, z0.d, #127 -# CHECK-NEXT: 1 3 0.50 cmphi p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.d, p0/z, z1.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z0.h, #0 -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z0.h, #127 -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z1.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z0.s, #0 -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z0.s, #127 -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z1.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z0.b, #0 -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z0.b, #127 -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z1.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmphs p0.d, p0/z, z0.d, #0 -# CHECK-NEXT: 1 3 0.50 cmphs p0.d, p0/z, z0.d, #127 -# CHECK-NEXT: 1 3 0.50 cmphs p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.d, p0/z, z1.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z0.h, #0 -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z0.h, #127 -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z1.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z0.s, #0 -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z0.s, #127 -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z1.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmple p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmple p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmple p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmple p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmple p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmple p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmple p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmple p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmple p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmple p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmple p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmplo p0.b, p0/z, z0.b, #0 -# CHECK-NEXT: 1 3 0.50 cmplo p0.b, p0/z, z0.b, #127 -# CHECK-NEXT: 1 3 0.50 cmplo p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmplo p0.d, p0/z, z0.d, #0 -# CHECK-NEXT: 1 3 0.50 cmplo p0.d, p0/z, z0.d, #127 -# CHECK-NEXT: 1 3 0.50 cmplo p0.h, p0/z, z0.h, #0 -# CHECK-NEXT: 1 3 0.50 cmplo p0.h, p0/z, z0.h, #127 -# CHECK-NEXT: 1 3 0.50 cmplo p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmplo p0.s, p0/z, z0.s, #0 -# CHECK-NEXT: 1 3 0.50 cmplo p0.s, p0/z, z0.s, #127 -# CHECK-NEXT: 1 3 0.50 cmplo p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpls p0.b, p0/z, z0.b, #0 -# CHECK-NEXT: 1 3 0.50 cmpls p0.b, p0/z, z0.b, #127 -# CHECK-NEXT: 1 3 0.50 cmpls p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpls p0.d, p0/z, z0.d, #0 -# CHECK-NEXT: 1 3 0.50 cmpls p0.d, p0/z, z0.d, #127 -# CHECK-NEXT: 1 3 0.50 cmpls p0.h, p0/z, z0.h, #0 -# CHECK-NEXT: 1 3 0.50 cmpls p0.h, p0/z, z0.h, #127 -# CHECK-NEXT: 1 3 0.50 cmpls p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpls p0.s, p0/z, z0.s, #0 -# CHECK-NEXT: 1 3 0.50 cmpls p0.s, p0/z, z0.s, #127 -# CHECK-NEXT: 1 3 0.50 cmpls p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmplt p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmplt p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmplt p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmplt p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmplt p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmplt p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmplt p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmplt p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmplt p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmplt p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmplt p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmpne p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmpne p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpne p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmpne p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmpne p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmpne p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmpne p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpne p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmpne p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmpne p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpeq p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpeq p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpeq p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpeq p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpeq p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpeq p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpeq p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpge p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmpge p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmpge p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpgt p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmphi p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 cmphi p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 4 0.50 cmphi p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmphs p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 cmphs p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 4 0.50 cmphs p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmple p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmple p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmple p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmple p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmple p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmple p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmple p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmple p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmple p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmple p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmple p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmplo p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 cmplo p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 4 0.50 cmplo p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmplo p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 cmplo p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 4 0.50 cmplo p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 cmplo p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 4 0.50 cmplo p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmplo p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 cmplo p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 4 0.50 cmplo p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpls p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 cmpls p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 4 0.50 cmpls p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpls p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 cmpls p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 4 0.50 cmpls p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 cmpls p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 4 0.50 cmpls p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpls p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 cmpls p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 4 0.50 cmpls p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmplt p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmplt p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmplt p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmplt p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmplt p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmplt p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmplt p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmplt p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmplt p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmplt p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmplt p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmpne p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmpne p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpne p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmpne p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmpne p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmpne p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmpne p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpne p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmpne p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmpne p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.s, p0/z, z0.s, z0.s # CHECK-NEXT: 1 3 0.50 cnot z31.b, p7/m, z31.b # CHECK-NEXT: 1 3 0.50 cnot z31.d, p7/m, z31.d # CHECK-NEXT: 1 3 0.50 cnot z31.h, p7/m, z31.h # CHECK-NEXT: 1 3 0.50 cnot z31.s, p7/m, z31.s -# CHECK-NEXT: 1 4 0.50 cnt z31.b, p7/m, z31.b +# CHECK-NEXT: 1 3 0.50 cnt z31.b, p7/m, z31.b # CHECK-NEXT: 1 12 0.50 cnt z31.d, p7/m, z31.d -# CHECK-NEXT: 1 4 0.50 cnt z31.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 cnt z31.h, p7/m, z31.h # CHECK-NEXT: 1 8 0.50 cnt z31.s, p7/m, z31.s # CHECK-NEXT: 1 1 0.33 cntb x0 # CHECK-NEXT: 1 1 0.33 cntb x0, #28 @@ -3874,10 +3874,10 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 1 0.33 cnth x0, #28 # CHECK-NEXT: 1 1 0.33 cnth x0, all, mul #16 # CHECK-NEXT: 1 1 0.33 cnth x0, pow2 -# CHECK-NEXT: 1 6 1.00 cntp x0, p15, p0.b -# CHECK-NEXT: 1 6 1.00 cntp x0, p15, p0.d -# CHECK-NEXT: 1 6 1.00 cntp x0, p15, p0.h -# CHECK-NEXT: 1 6 1.00 cntp x0, p15, p0.s +# CHECK-NEXT: 1 4 1.00 cntp x0, p15, p0.b +# CHECK-NEXT: 1 4 1.00 cntp x0, p15, p0.d +# CHECK-NEXT: 1 4 1.00 cntp x0, p15, p0.h +# CHECK-NEXT: 1 4 1.00 cntp x0, p15, p0.s # CHECK-NEXT: 1 1 0.33 cntw x0 # CHECK-NEXT: 1 1 0.33 cntw x0, #28 # CHECK-NEXT: 1 1 0.33 cntw x0, all, mul #16 @@ -3892,42 +3892,42 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 1 0.33 ctermne wzr, w30 # CHECK-NEXT: 1 1 0.33 ctermne x30, xzr # CHECK-NEXT: 1 1 0.33 ctermne xzr, x30 -# CHECK-NEXT: 1 1 0.33 decb x0 -# CHECK-NEXT: 1 1 0.33 decb x0, #14 -# CHECK-NEXT: 1 1 0.33 decb x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 decb x0, pow2 -# CHECK-NEXT: 1 1 0.33 decb x0, vl1 -# CHECK-NEXT: 1 1 0.33 decd x0 -# CHECK-NEXT: 1 1 0.33 decd x0, #14 -# CHECK-NEXT: 1 1 0.33 decd x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 decd x0, pow2 -# CHECK-NEXT: 1 1 0.33 decd x0, vl1 -# CHECK-NEXT: 1 1 0.33 dech x0 -# CHECK-NEXT: 1 1 0.33 dech x0, #14 -# CHECK-NEXT: 1 1 0.33 dech x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 dech x0, pow2 -# CHECK-NEXT: 1 1 0.33 dech x0, vl1 -# CHECK-NEXT: 1 6 1.00 decp x0, p0.b -# CHECK-NEXT: 1 6 1.00 decp x0, p0.d -# CHECK-NEXT: 1 6 1.00 decp x0, p0.h -# CHECK-NEXT: 1 6 1.00 decp x0, p0.s -# CHECK-NEXT: 1 6 1.00 decp xzr, p15.b -# CHECK-NEXT: 1 6 1.00 decp xzr, p15.d -# CHECK-NEXT: 1 6 1.00 decp xzr, p15.h -# CHECK-NEXT: 1 6 1.00 decp xzr, p15.s +# CHECK-NEXT: 1 3 0.33 decb x0 +# CHECK-NEXT: 1 3 0.33 decb x0, #14 +# CHECK-NEXT: 1 3 0.33 decb x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 decb x0, pow2 +# CHECK-NEXT: 1 3 0.33 decb x0, vl1 +# CHECK-NEXT: 1 3 0.33 decd x0 +# CHECK-NEXT: 1 3 0.33 decd x0, #14 +# CHECK-NEXT: 1 3 0.33 decd x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 decd x0, pow2 +# CHECK-NEXT: 1 3 0.33 decd x0, vl1 +# CHECK-NEXT: 1 3 0.33 dech x0 +# CHECK-NEXT: 1 3 0.33 dech x0, #14 +# CHECK-NEXT: 1 3 0.33 dech x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 dech x0, pow2 +# CHECK-NEXT: 1 3 0.33 dech x0, vl1 +# CHECK-NEXT: 1 4 1.00 decp x0, p0.b +# CHECK-NEXT: 1 4 1.00 decp x0, p0.d +# CHECK-NEXT: 1 4 1.00 decp x0, p0.h +# CHECK-NEXT: 1 4 1.00 decp x0, p0.s +# CHECK-NEXT: 1 4 1.00 decp xzr, p15.b +# CHECK-NEXT: 1 4 1.00 decp xzr, p15.d +# CHECK-NEXT: 1 4 1.00 decp xzr, p15.h +# CHECK-NEXT: 1 4 1.00 decp xzr, p15.s # CHECK-NEXT: 1 4 0.50 decp z31.d, p15.d # CHECK-NEXT: 1 4 0.50 decp z31.h, p15.h # CHECK-NEXT: 1 4 0.50 decp z31.s, p15.s -# CHECK-NEXT: 1 1 0.33 decw x0 -# CHECK-NEXT: 1 1 0.33 decw x0, #14 -# CHECK-NEXT: 1 1 0.33 decw x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 decw x0, pow2 -# CHECK-NEXT: 1 1 0.33 decw x0, vl1 +# CHECK-NEXT: 1 3 0.33 decw x0 +# CHECK-NEXT: 1 3 0.33 decw x0, #14 +# CHECK-NEXT: 1 3 0.33 decw x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 decw x0, pow2 +# CHECK-NEXT: 1 3 0.33 decw x0, vl1 # CHECK-NEXT: 1 4 0.50 dupm z0.d, #0xfffffffffffffff9 # CHECK-NEXT: 1 4 0.50 dupm z0.s, #0xfffffff9 # CHECK-NEXT: 1 4 0.50 dupm z23.h, #0xfff9 # CHECK-NEXT: 1 4 0.50 dupm z5.b, #0xf9 -# CHECK-NEXT: 1 6 1.00 eor p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 eor p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 3 0.50 eor z0.d, z0.d, #0x6 # CHECK-NEXT: 1 3 0.50 eor z0.d, z0.d, #0xfffffffffffffff9 # CHECK-NEXT: 1 3 0.50 eor z0.d, z0.d, z0.d @@ -3942,12 +3942,12 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 eor z31.s, p7/m, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 eor z5.b, z5.b, #0x6 # CHECK-NEXT: 1 3 0.50 eor z5.b, z5.b, #0xf9 -# CHECK-NEXT: 1 3 0.50 eor3 z29.d, z29.d, z30.d, z31.d +# CHECK-NEXT: 1 4 0.50 eor3 z29.d, z29.d, z30.d, z31.d # CHECK-NEXT: 1 4 0.50 eorbt z0.b, z1.b, z31.b # CHECK-NEXT: 1 4 0.50 eorbt z0.d, z1.d, z31.d # CHECK-NEXT: 1 4 0.50 eorbt z0.h, z1.h, z31.h # CHECK-NEXT: 1 4 0.50 eorbt z0.s, z1.s, z31.s -# CHECK-NEXT: 1 6 1.00 eors p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 eors p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 4 0.50 eortb z0.b, z1.b, z31.b # CHECK-NEXT: 1 4 0.50 eortb z0.d, z1.d, z31.d # CHECK-NEXT: 1 4 0.50 eortb z0.h, z1.h, z31.h @@ -4303,49 +4303,49 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 ftsmul z0.d, z1.d, z31.d # CHECK-NEXT: 1 4 0.50 ftsmul z0.h, z1.h, z31.h # CHECK-NEXT: 1 4 0.50 ftsmul z0.s, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 ftssel z0.d, z1.d, z31.d -# CHECK-NEXT: 1 4 0.50 ftssel z0.h, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 ftssel z0.s, z1.s, z31.s +# CHECK-NEXT: 1 3 0.50 ftssel z0.d, z1.d, z31.d +# CHECK-NEXT: 1 3 0.50 ftssel z0.h, z1.h, z31.h +# CHECK-NEXT: 1 3 0.50 ftssel z0.s, z1.s, z31.s # CHECK-NEXT: 1 8 2.00 histcnt z0.s, p0/z, z1.s, z2.s # CHECK-NEXT: 1 8 2.00 histcnt z29.d, p7/z, z30.d, z31.d # CHECK-NEXT: 1 8 2.00 histseg z0.b, z1.b, z31.b -# CHECK-NEXT: 1 1 0.33 incb x0 -# CHECK-NEXT: 1 1 0.33 incb x0, #14 -# CHECK-NEXT: 1 1 0.33 incb x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 incb x0, pow2 -# CHECK-NEXT: 1 1 0.33 incb x0, vl1 -# CHECK-NEXT: 1 1 0.33 incd x0 -# CHECK-NEXT: 1 1 0.33 incd x0, #14 -# CHECK-NEXT: 1 1 0.33 incd x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 incd x0, pow2 -# CHECK-NEXT: 1 1 0.33 incd x0, vl1 -# CHECK-NEXT: 1 4 0.50 incd z0.d -# CHECK-NEXT: 1 4 0.50 incd z0.d, all, mul #16 -# CHECK-NEXT: 1 1 0.33 inch x0 -# CHECK-NEXT: 1 1 0.33 inch x0, #14 -# CHECK-NEXT: 1 1 0.33 inch x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 inch x0, pow2 -# CHECK-NEXT: 1 1 0.33 inch x0, vl1 -# CHECK-NEXT: 1 4 0.50 inch z0.h -# CHECK-NEXT: 1 4 0.50 inch z0.h, all, mul #16 -# CHECK-NEXT: 1 6 1.00 incp x0, p0.b -# CHECK-NEXT: 1 6 1.00 incp x0, p0.d -# CHECK-NEXT: 1 6 1.00 incp x0, p0.h -# CHECK-NEXT: 1 6 1.00 incp x0, p0.s -# CHECK-NEXT: 1 6 1.00 incp xzr, p15.b -# CHECK-NEXT: 1 6 1.00 incp xzr, p15.d -# CHECK-NEXT: 1 6 1.00 incp xzr, p15.h -# CHECK-NEXT: 1 6 1.00 incp xzr, p15.s +# CHECK-NEXT: 1 3 0.33 incb x0 +# CHECK-NEXT: 1 3 0.33 incb x0, #14 +# CHECK-NEXT: 1 3 0.33 incb x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 incb x0, pow2 +# CHECK-NEXT: 1 3 0.33 incb x0, vl1 +# CHECK-NEXT: 1 3 0.33 incd x0 +# CHECK-NEXT: 1 3 0.33 incd x0, #14 +# CHECK-NEXT: 1 3 0.33 incd x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 incd x0, pow2 +# CHECK-NEXT: 1 3 0.33 incd x0, vl1 +# CHECK-NEXT: 1 3 0.50 incd z0.d +# CHECK-NEXT: 1 3 0.50 incd z0.d, all, mul #16 +# CHECK-NEXT: 1 3 0.33 inch x0 +# CHECK-NEXT: 1 3 0.33 inch x0, #14 +# CHECK-NEXT: 1 3 0.33 inch x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 inch x0, pow2 +# CHECK-NEXT: 1 3 0.33 inch x0, vl1 +# CHECK-NEXT: 1 3 0.50 inch z0.h +# CHECK-NEXT: 1 3 0.50 inch z0.h, all, mul #16 +# CHECK-NEXT: 1 4 1.00 incp x0, p0.b +# CHECK-NEXT: 1 4 1.00 incp x0, p0.d +# CHECK-NEXT: 1 4 1.00 incp x0, p0.h +# CHECK-NEXT: 1 4 1.00 incp x0, p0.s +# CHECK-NEXT: 1 4 1.00 incp xzr, p15.b +# CHECK-NEXT: 1 4 1.00 incp xzr, p15.d +# CHECK-NEXT: 1 4 1.00 incp xzr, p15.h +# CHECK-NEXT: 1 4 1.00 incp xzr, p15.s # CHECK-NEXT: 1 4 0.50 incp z31.d, p15.d # CHECK-NEXT: 1 4 0.50 incp z31.h, p15.h # CHECK-NEXT: 1 4 0.50 incp z31.s, p15.s -# CHECK-NEXT: 1 1 0.33 incw x0 -# CHECK-NEXT: 1 1 0.33 incw x0, #14 -# CHECK-NEXT: 1 1 0.33 incw x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 incw x0, pow2 -# CHECK-NEXT: 1 1 0.33 incw x0, vl1 -# CHECK-NEXT: 1 4 0.50 incw z0.s -# CHECK-NEXT: 1 4 0.50 incw z0.s, all, mul #16 +# CHECK-NEXT: 1 3 0.33 incw x0 +# CHECK-NEXT: 1 3 0.33 incw x0, #14 +# CHECK-NEXT: 1 3 0.33 incw x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 incw x0, pow2 +# CHECK-NEXT: 1 3 0.33 incw x0, vl1 +# CHECK-NEXT: 1 3 0.50 incw z0.s +# CHECK-NEXT: 1 3 0.50 incw z0.s, all, mul #16 # CHECK-NEXT: 1 4 0.50 index z0.b, #0, #0 # CHECK-NEXT: 1 4 0.50 index z0.d, #0, #0 # CHECK-NEXT: 1 4 0.50 index z0.h, #0, #0 @@ -4412,8 +4412,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 * ld1b { z0.d }, p0/z, [x0] # CHECK-NEXT: 1 7 7.00 * ld1b { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 3 0.50 * ld1b { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 9 4.50 * ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 3 0.50 * ld1b { z0.s }, p0/z, [x0] # CHECK-NEXT: 1 9 9.00 * ld1b { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 3 0.50 * ld1b { z21.b }, p5/z, [x10, #5, mul vl] @@ -4450,8 +4450,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 * ld1h { z0.d }, p0/z, [x0] # CHECK-NEXT: 1 7 7.00 * ld1h { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 3 0.50 * ld1h { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 9 4.50 * ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 3 0.50 * ld1h { z0.s }, p0/z, [x0] # CHECK-NEXT: 1 9 9.00 * ld1h { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 3 0.50 * ld1h { z21.d }, p5/z, [x10, #5, mul vl] @@ -4467,8 +4467,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * ld1h { z31.d }, p7/z, [z31.d, #62] # CHECK-NEXT: 1 3 0.50 * ld1h { z31.h }, p7/z, [sp, #-1, mul vl] # CHECK-NEXT: 1 3 0.50 * ld1h { z31.s }, p7/z, [sp, #-1, mul vl] -# CHECK-NEXT: 1 9 4.50 * ld1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: 1 9 4.50 * ld1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: 1 7 3.50 * ld1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: 1 7 3.50 * ld1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: 1 9 9.00 * ld1h { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: 1 3 0.50 * ld1h { z5.h }, p3/z, [sp, x16, lsl #1] # CHECK-NEXT: 1 3 0.50 * ld1h { z5.h }, p3/z, [x17, x16, lsl #1] @@ -4529,7 +4529,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 * ld1sb { z0.h }, p0/z, [sp, x0] # CHECK-NEXT: 1 3 0.50 * ld1sb { z0.h }, p0/z, [x0, x0] # CHECK-NEXT: 1 3 0.50 * ld1sb { z0.h }, p0/z, [x0] -# CHECK-NEXT: 1 9 4.50 * ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw] # CHECK-NEXT: 1 3 0.50 * ld1sb { z0.s }, p0/z, [x0] # CHECK-NEXT: 1 9 9.00 * ld1sb { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 3 0.50 * ld1sb { z21.d }, p5/z, [x10, #5, mul vl] @@ -4549,8 +4549,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1] # CHECK-NEXT: 1 3 0.50 * ld1sh { z0.d }, p0/z, [x0] # CHECK-NEXT: 1 7 7.00 * ld1sh { z0.d }, p0/z, [z0.d] -# CHECK-NEXT: 1 9 4.50 * ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 3 0.50 * ld1sh { z0.s }, p0/z, [x0] # CHECK-NEXT: 1 9 9.00 * ld1sh { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 3 0.50 * ld1sh { z21.d }, p5/z, [x10, #5, mul vl] @@ -4565,8 +4565,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * ld1sh { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: 1 7 7.00 * ld1sh { z31.d }, p7/z, [z31.d, #62] # CHECK-NEXT: 1 3 0.50 * ld1sh { z31.s }, p7/z, [sp, #-1, mul vl] -# CHECK-NEXT: 1 9 4.50 * ld1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: 1 9 4.50 * ld1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: 1 7 3.50 * ld1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: 1 7 3.50 * ld1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: 1 9 9.00 * ld1sh { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: 1 7 7.00 * ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2] # CHECK-NEXT: 1 7 7.00 * ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2] @@ -4585,8 +4585,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] # CHECK-NEXT: 1 3 0.50 * ld1w { z0.d }, p0/z, [x0] # CHECK-NEXT: 1 7 7.00 * ld1w { z0.d }, p0/z, [z0.d] -# CHECK-NEXT: 1 9 4.50 * ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 3 0.50 * ld1w { z0.s }, p0/z, [x0] # CHECK-NEXT: 1 9 9.00 * ld1w { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 3 0.50 * ld1w { z21.d }, p5/z, [x10, #5, mul vl] @@ -4601,8 +4601,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * ld1w { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: 1 7 7.00 * ld1w { z31.d }, p7/z, [z31.d, #124] # CHECK-NEXT: 1 3 0.50 * ld1w { z31.s }, p7/z, [sp, #-1, mul vl] -# CHECK-NEXT: 1 9 4.50 * ld1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] -# CHECK-NEXT: 1 9 4.50 * ld1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] +# CHECK-NEXT: 1 7 3.50 * ld1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] +# CHECK-NEXT: 1 7 3.50 * ld1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] # CHECK-NEXT: 1 9 9.00 * ld1w { z31.s }, p7/z, [z31.s, #124] # CHECK-NEXT: 1 3 2.00 * ld2b { z0.b, z1.b }, p0/z, [x0, x0] # CHECK-NEXT: 1 3 1.00 * ld2b { z0.b, z1.b }, p0/z, [x0] @@ -4668,8 +4668,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * U ldff1b { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 3 0.50 * U ldff1b { z0.h }, p0/z, [x0, x0] # CHECK-NEXT: 1 3 0.50 * U ldff1b { z0.s }, p0/z, [x0, x0] -# CHECK-NEXT: 1 9 4.50 * U ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * U ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 9 9.00 * U ldff1b { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 7 7.00 * U ldff1b { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: 1 7 7.00 * U ldff1b { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -4696,8 +4696,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * U ldff1h { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 3 0.50 * U ldff1h { z0.h }, p0/z, [x0, x0, lsl #1] # CHECK-NEXT: 1 3 0.50 * U ldff1h { z0.s }, p0/z, [x0, x0, lsl #1] -# CHECK-NEXT: 1 9 4.50 * U ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * U ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 9 9.00 * U ldff1h { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 7 7.00 * U ldff1h { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: 1 7 7.00 * U ldff1h { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -4706,16 +4706,16 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 * U ldff1h { z31.d }, p7/z, [sp] # CHECK-NEXT: 1 7 7.00 * U ldff1h { z31.d }, p7/z, [z31.d, #62] # CHECK-NEXT: 1 3 0.50 * U ldff1h { z31.h }, p7/z, [sp] -# CHECK-NEXT: 1 9 4.50 * U ldff1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: 1 9 4.50 * U ldff1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: 1 7 3.50 * U ldff1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: 1 7 3.50 * U ldff1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: 1 3 0.50 * U ldff1h { z31.s }, p7/z, [sp] # CHECK-NEXT: 1 9 9.00 * U ldff1h { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: 1 3 0.50 * U ldff1sb { z0.d }, p0/z, [x0, x0] # CHECK-NEXT: 1 7 7.00 * U ldff1sb { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 3 0.50 * U ldff1sb { z0.h }, p0/z, [x0, x0] # CHECK-NEXT: 1 3 0.50 * U ldff1sb { z0.s }, p0/z, [x0, x0] -# CHECK-NEXT: 1 9 4.50 * U ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * U ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 9 9.00 * U ldff1sb { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 7 7.00 * U ldff1sb { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: 1 7 7.00 * U ldff1sb { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -4730,8 +4730,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * U ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1] # CHECK-NEXT: 1 7 7.00 * U ldff1sh { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 3 0.50 * U ldff1sh { z0.s }, p0/z, [x0, x0, lsl #1] -# CHECK-NEXT: 1 9 4.50 * U ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * U ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 9 9.00 * U ldff1sh { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 7 7.00 * U ldff1sh { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: 1 7 7.00 * U ldff1sh { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -4739,8 +4739,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * U ldff1sh { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: 1 3 0.50 * U ldff1sh { z31.d }, p7/z, [sp] # CHECK-NEXT: 1 7 7.00 * U ldff1sh { z31.d }, p7/z, [z31.d, #62] -# CHECK-NEXT: 1 9 4.50 * U ldff1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: 1 9 4.50 * U ldff1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: 1 7 3.50 * U ldff1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: 1 7 3.50 * U ldff1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: 1 3 0.50 * U ldff1sh { z31.s }, p7/z, [sp] # CHECK-NEXT: 1 9 9.00 * U ldff1sh { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: 1 3 0.50 * U ldff1sw { z0.d }, p0/z, [x0, x0, lsl #2] @@ -4758,8 +4758,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * U ldff1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] # CHECK-NEXT: 1 7 7.00 * U ldff1w { z0.d }, p0/z, [z0.d] # CHECK-NEXT: 1 3 0.50 * U ldff1w { z0.s }, p0/z, [x0, x0, lsl #2] -# CHECK-NEXT: 1 9 4.50 * U ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: 1 9 4.50 * U ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: 1 7 3.50 * U ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: 1 9 9.00 * U ldff1w { z0.s }, p0/z, [z0.s] # CHECK-NEXT: 1 7 7.00 * U ldff1w { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: 1 7 7.00 * U ldff1w { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -4767,8 +4767,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 7.00 * U ldff1w { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: 1 3 0.50 * U ldff1w { z31.d }, p7/z, [sp] # CHECK-NEXT: 1 7 7.00 * U ldff1w { z31.d }, p7/z, [z31.d, #124] -# CHECK-NEXT: 1 9 4.50 * U ldff1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] -# CHECK-NEXT: 1 9 4.50 * U ldff1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] +# CHECK-NEXT: 1 7 3.50 * U ldff1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] +# CHECK-NEXT: 1 7 3.50 * U ldff1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] # CHECK-NEXT: 1 3 0.50 * U ldff1w { z31.s }, p7/z, [sp] # CHECK-NEXT: 1 9 9.00 * U ldff1w { z31.s }, p7/z, [z31.s, #124] # CHECK-NEXT: 1 3 0.50 * U ldnf1b { z0.b }, p0/z, [x0] @@ -4959,12 +4959,12 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 mls z0.h, z1.h, z7.h[7] # CHECK-NEXT: 1 4 0.50 mls z0.s, p7/m, z1.s, z31.s # CHECK-NEXT: 1 4 0.50 mls z0.s, z1.s, z7.s[3] -# CHECK-NEXT: 1 6 1.00 mov p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 mov p0.b, p0/m, p0.b -# CHECK-NEXT: 1 6 1.00 mov p0.b, p0/z, p0.b -# CHECK-NEXT: 1 6 1.00 mov p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 mov p15.b, p15/m, p15.b -# CHECK-NEXT: 1 6 1.00 mov p15.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 mov p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 mov p0.b, p0/m, p0.b +# CHECK-NEXT: 1 2 1.00 mov p0.b, p0/z, p0.b +# CHECK-NEXT: 1 2 1.00 mov p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 mov p15.b, p15/m, p15.b +# CHECK-NEXT: 1 2 1.00 mov p15.b, p15/z, p15.b # CHECK-NEXT: 1 3 0.50 mov z0.b, #127 # CHECK-NEXT: 1 3 0.50 mov z0.b, b0 # CHECK-NEXT: 1 3 0.50 mov z0.b, p0/m, b0 @@ -5062,10 +5062,10 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 mov z5.h, #-6 # CHECK-NEXT: 1 3 0.50 mov z5.q, z17.q[3] # CHECK-NEXT: 1 3 0.50 mov z5.s, #-6 -# CHECK-NEXT: 1 6 1.00 movs p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 movs p0.b, p0/z, p0.b -# CHECK-NEXT: 1 6 1.00 movs p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 movs p15.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 movs p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 movs p0.b, p0/z, p0.b +# CHECK-NEXT: 1 2 1.00 movs p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 movs p15.b, p15/z, p15.b # CHECK-NEXT: 1 1 1.00 U mrs x3, ID_AA64ZFR0_EL1 # CHECK-NEXT: 1 1 1.00 U mrs x3, ZCR_EL1 # CHECK-NEXT: 1 1 1.00 U mrs x3, ZCR_EL12 @@ -5098,10 +5098,10 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 mul z31.h, z31.h, #127 # CHECK-NEXT: 1 4 0.50 mul z31.s, z31.s, #-128 # CHECK-NEXT: 1 4 0.50 mul z31.s, z31.s, #127 -# CHECK-NEXT: 1 6 1.00 nand p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 nand p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 nands p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 nands p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 nand p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 nand p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 nands p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 nands p15.b, p15/z, p15.b, p15.b # CHECK-NEXT: 1 3 0.50 nbsl z0.d, z0.d, z1.d, z2.d # CHECK-NEXT: 1 3 0.50 neg z0.b, p0/m, z0.b # CHECK-NEXT: 1 3 0.50 neg z0.d, p0/m, z0.d @@ -5115,23 +5115,23 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 1.00 nmatch p0.h, p0/z, z0.h, z0.h # CHECK-NEXT: 1 7 1.00 nmatch p15.b, p7/z, z30.b, z31.b # CHECK-NEXT: 1 7 1.00 nmatch p15.h, p7/z, z30.h, z31.h -# CHECK-NEXT: 1 6 1.00 nor p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 nor p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 nors p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 nors p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 not p0.b, p0/z, p0.b -# CHECK-NEXT: 1 6 1.00 not p15.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 nor p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 nor p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 nors p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 nors p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 not p0.b, p0/z, p0.b +# CHECK-NEXT: 1 2 1.00 not p15.b, p15/z, p15.b # CHECK-NEXT: 1 3 0.50 not z31.b, p7/m, z31.b # CHECK-NEXT: 1 3 0.50 not z31.d, p7/m, z31.d # CHECK-NEXT: 1 3 0.50 not z31.h, p7/m, z31.h # CHECK-NEXT: 1 3 0.50 not z31.s, p7/m, z31.s -# CHECK-NEXT: 1 6 1.00 nots p0.b, p0/z, p0.b -# CHECK-NEXT: 1 6 1.00 nots p15.b, p15/z, p15.b -# CHECK-NEXT: 1 6 1.00 orn p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 orn p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 orns p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 orns p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 orr p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 nots p0.b, p0/z, p0.b +# CHECK-NEXT: 1 2 1.00 nots p15.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 orn p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 orn p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 orns p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 orns p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 orr p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 3 0.50 orr z0.d, z0.d, #0x6 # CHECK-NEXT: 1 3 0.50 orr z0.d, z0.d, #0xfffffffffffffff9 # CHECK-NEXT: 1 3 0.50 orr z0.s, z0.s, #0x6 @@ -5145,27 +5145,27 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 orr z31.s, p7/m, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 orr z5.b, z5.b, #0x6 # CHECK-NEXT: 1 3 0.50 orr z5.b, z5.b, #0xf9 -# CHECK-NEXT: 1 6 1.00 orrs p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 orrs p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 4 1.00 orv b0, p7, z31.b # CHECK-NEXT: 1 4 1.00 orv d0, p7, z31.d # CHECK-NEXT: 1 4 1.00 orv h0, p7, z31.h # CHECK-NEXT: 1 4 1.00 orv s0, p7, z31.s -# CHECK-NEXT: 1 6 1.00 pfalse p15.b -# CHECK-NEXT: 1 6 1.00 pfirst p0.b, p15, p0.b -# CHECK-NEXT: 1 6 1.00 pfirst p15.b, p15, p15.b +# CHECK-NEXT: 1 2 1.00 pfalse p15.b +# CHECK-NEXT: 1 2 1.00 pfirst p0.b, p15, p0.b +# CHECK-NEXT: 1 2 1.00 pfirst p15.b, p15, p15.b # CHECK-NEXT: 1 4 0.50 pmul z0.b, z1.b, z2.b # CHECK-NEXT: 1 4 0.50 pmul z29.b, z30.b, z31.b -# CHECK-NEXT: 1 6 1.00 pmullb z0.h, z1.b, z2.b -# CHECK-NEXT: 1 6 1.00 pmullb z29.q, z30.d, z31.d -# CHECK-NEXT: 1 6 1.00 pmullb z31.d, z31.s, z31.s -# CHECK-NEXT: 1 6 1.00 pmullt z0.h, z1.b, z2.b -# CHECK-NEXT: 1 6 1.00 pmullt z29.q, z30.d, z31.d -# CHECK-NEXT: 1 6 1.00 pmullt z31.d, z31.s, z31.s -# CHECK-NEXT: 1 6 1.00 pnext p0.b, p15, p0.b -# CHECK-NEXT: 1 6 1.00 pnext p0.d, p15, p0.d -# CHECK-NEXT: 1 6 1.00 pnext p0.h, p15, p0.h -# CHECK-NEXT: 1 6 1.00 pnext p0.s, p15, p0.s -# CHECK-NEXT: 1 6 1.00 pnext p15.b, p15, p15.b +# CHECK-NEXT: 1 9 1.00 pmullb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 9 1.00 pmullb z29.q, z30.d, z31.d +# CHECK-NEXT: 1 9 1.00 pmullb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 9 1.00 pmullt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 9 1.00 pmullt z29.q, z30.d, z31.d +# CHECK-NEXT: 1 9 1.00 pmullt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 2 1.00 pnext p0.b, p15, p0.b +# CHECK-NEXT: 1 2 1.00 pnext p0.d, p15, p0.d +# CHECK-NEXT: 1 2 1.00 pnext p0.h, p15, p0.h +# CHECK-NEXT: 1 2 1.00 pnext p0.s, p15, p0.s +# CHECK-NEXT: 1 2 1.00 pnext p15.b, p15, p15.b # CHECK-NEXT: 1 0 0.50 * * U prfb #14, p0, [x0] # CHECK-NEXT: 1 0 0.50 * * U prfb #15, p0, [x0] # CHECK-NEXT: 1 0 0.50 * * U prfb #6, p0, [x0] @@ -5274,97 +5274,97 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 0 0.50 * * U prfw pstl2strm, p0, [x0] # CHECK-NEXT: 1 0 0.50 * * U prfw pstl3keep, p0, [x0] # CHECK-NEXT: 1 0 0.50 * * U prfw pstl3strm, p0, [x0] -# CHECK-NEXT: 1 6 1.00 ptest p15, p0.b -# CHECK-NEXT: 1 6 1.00 ptest p15, p15.b -# CHECK-NEXT: 1 6 1.00 ptrue p0.b, pow2 -# CHECK-NEXT: 1 6 1.00 ptrue p0.d, pow2 -# CHECK-NEXT: 1 6 1.00 ptrue p0.h, pow2 -# CHECK-NEXT: 1 6 1.00 ptrue p0.s, pow2 -# CHECK-NEXT: 1 6 1.00 ptrue p15.b -# CHECK-NEXT: 1 6 1.00 ptrue p15.d -# CHECK-NEXT: 1 6 1.00 ptrue p15.h -# CHECK-NEXT: 1 6 1.00 ptrue p15.s -# CHECK-NEXT: 1 6 1.00 ptrue p7.s -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #14 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #15 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #16 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #17 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #18 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #19 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #20 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #21 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #22 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #23 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #24 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #25 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #26 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #27 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #28 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, mul3 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, mul4 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl1 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl128 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl16 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl2 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl256 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl3 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl32 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl4 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl5 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl6 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl64 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl7 -# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl8 -# CHECK-NEXT: 1 6 1.00 ptrues p0.b, pow2 -# CHECK-NEXT: 1 6 1.00 ptrues p0.d, pow2 -# CHECK-NEXT: 1 6 1.00 ptrues p0.h, pow2 -# CHECK-NEXT: 1 6 1.00 ptrues p0.s, pow2 -# CHECK-NEXT: 1 6 1.00 ptrues p15.b -# CHECK-NEXT: 1 6 1.00 ptrues p15.d -# CHECK-NEXT: 1 6 1.00 ptrues p15.h -# CHECK-NEXT: 1 6 1.00 ptrues p15.s -# CHECK-NEXT: 1 6 1.00 ptrues p7.s -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #14 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #15 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #16 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #17 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #18 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #19 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #20 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #21 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #22 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #23 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #24 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #25 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #26 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #27 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #28 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, mul3 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, mul4 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl1 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl128 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl16 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl2 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl256 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl3 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl32 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl4 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl5 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl6 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl64 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl7 -# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl8 -# CHECK-NEXT: 1 6 1.00 punpkhi p0.h, p0.b -# CHECK-NEXT: 1 6 1.00 punpkhi p15.h, p15.b -# CHECK-NEXT: 1 6 1.00 punpklo p0.h, p0.b -# CHECK-NEXT: 1 6 1.00 punpklo p15.h, p15.b -# CHECK-NEXT: 1 4 0.50 raddhnb z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 raddhnb z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 raddhnb z0.s, z1.d, z31.d -# CHECK-NEXT: 1 4 0.50 raddhnt z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 raddhnt z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 raddhnt z0.s, z1.d, z31.d -# CHECK-NEXT: 1 8 1.00 rax1 z0.d, z1.d, z31.d +# CHECK-NEXT: 1 2 1.00 ptest p15, p0.b +# CHECK-NEXT: 1 2 1.00 ptest p15, p15.b +# CHECK-NEXT: 1 2 1.00 ptrue p0.b, pow2 +# CHECK-NEXT: 1 2 1.00 ptrue p0.d, pow2 +# CHECK-NEXT: 1 2 1.00 ptrue p0.h, pow2 +# CHECK-NEXT: 1 2 1.00 ptrue p0.s, pow2 +# CHECK-NEXT: 1 2 1.00 ptrue p15.b +# CHECK-NEXT: 1 2 1.00 ptrue p15.d +# CHECK-NEXT: 1 2 1.00 ptrue p15.h +# CHECK-NEXT: 1 2 1.00 ptrue p15.s +# CHECK-NEXT: 1 2 1.00 ptrue p7.s +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #14 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #15 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #16 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #17 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #18 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #19 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #20 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #21 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #22 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #23 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #24 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #25 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #26 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #27 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #28 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, mul3 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, mul4 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl1 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl128 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl16 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl2 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl256 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl3 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl32 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl4 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl5 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl6 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl64 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl7 +# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl8 +# CHECK-NEXT: 1 2 1.00 ptrues p0.b, pow2 +# CHECK-NEXT: 1 2 1.00 ptrues p0.d, pow2 +# CHECK-NEXT: 1 2 1.00 ptrues p0.h, pow2 +# CHECK-NEXT: 1 2 1.00 ptrues p0.s, pow2 +# CHECK-NEXT: 1 2 1.00 ptrues p15.b +# CHECK-NEXT: 1 2 1.00 ptrues p15.d +# CHECK-NEXT: 1 2 1.00 ptrues p15.h +# CHECK-NEXT: 1 2 1.00 ptrues p15.s +# CHECK-NEXT: 1 2 1.00 ptrues p7.s +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #14 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #15 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #16 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #17 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #18 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #19 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #20 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #21 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #22 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #23 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #24 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #25 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #26 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #27 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #28 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, mul3 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, mul4 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl1 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl128 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl16 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl2 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl256 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl3 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl32 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl4 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl5 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl6 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl64 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl7 +# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl8 +# CHECK-NEXT: 1 2 1.00 punpkhi p0.h, p0.b +# CHECK-NEXT: 1 2 1.00 punpkhi p15.h, p15.b +# CHECK-NEXT: 1 2 1.00 punpklo p0.h, p0.b +# CHECK-NEXT: 1 2 1.00 punpklo p15.h, p15.b +# CHECK-NEXT: 1 8 0.50 raddhnb z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 raddhnb z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 raddhnb z0.s, z1.d, z31.d +# CHECK-NEXT: 1 8 0.50 raddhnt z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 raddhnt z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 raddhnt z0.s, z1.d, z31.d +# CHECK-NEXT: 1 9 1.00 rax1 z0.d, z1.d, z31.d # CHECK-NEXT: 1 3 0.50 rbit z0.b, p7/m, z31.b # CHECK-NEXT: 1 3 0.50 rbit z0.d, p7/m, z31.d # CHECK-NEXT: 1 3 0.50 rbit z0.h, p7/m, z31.h @@ -5379,16 +5379,16 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 1 0.33 rdvl x21, #-32 # CHECK-NEXT: 1 1 0.33 rdvl x23, #31 # CHECK-NEXT: 1 1 0.33 rdvl xzr, #-1 -# CHECK-NEXT: 1 4 0.50 rev z0.b, z31.b -# CHECK-NEXT: 1 4 0.50 rev z0.d, z31.d -# CHECK-NEXT: 1 4 0.50 rev z0.h, z31.h -# CHECK-NEXT: 1 4 0.50 rev z0.s, z31.s -# CHECK-NEXT: 1 4 0.50 revb z0.d, p7/m, z31.d -# CHECK-NEXT: 1 4 0.50 revb z0.h, p7/m, z31.h -# CHECK-NEXT: 1 4 0.50 revb z0.s, p7/m, z31.s -# CHECK-NEXT: 1 4 0.50 revh z0.d, p7/m, z31.d -# CHECK-NEXT: 1 4 0.50 revh z0.s, p7/m, z31.s -# CHECK-NEXT: 1 4 0.50 revw z0.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 rev z0.b, z31.b +# CHECK-NEXT: 1 3 0.50 rev z0.d, z31.d +# CHECK-NEXT: 1 3 0.50 rev z0.h, z31.h +# CHECK-NEXT: 1 3 0.50 rev z0.s, z31.s +# CHECK-NEXT: 1 3 0.50 revb z0.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 revb z0.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 revb z0.s, p7/m, z31.s +# CHECK-NEXT: 1 3 0.50 revh z0.d, p7/m, z31.d +# CHECK-NEXT: 1 3 0.50 revh z0.s, p7/m, z31.s +# CHECK-NEXT: 1 3 0.50 revw z0.d, p7/m, z31.d # CHECK-NEXT: 1 4 0.50 rshrnb z0.b, z0.h, #1 # CHECK-NEXT: 1 4 0.50 rshrnb z0.h, z0.s, #1 # CHECK-NEXT: 1 4 0.50 rshrnb z0.s, z0.d, #1 @@ -5401,22 +5401,22 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 rshrnt z31.b, z31.h, #8 # CHECK-NEXT: 1 4 0.50 rshrnt z31.h, z31.s, #16 # CHECK-NEXT: 1 4 0.50 rshrnt z31.s, z31.d, #32 -# CHECK-NEXT: 1 4 0.50 rsubhnb z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 rsubhnb z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 rsubhnb z0.s, z1.d, z31.d -# CHECK-NEXT: 1 4 0.50 rsubhnt z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 rsubhnt z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 rsubhnt z0.s, z1.d, z31.d -# CHECK-NEXT: 1 8 1.00 saba z0.b, z1.b, z31.b -# CHECK-NEXT: 1 8 1.00 saba z0.d, z1.d, z31.d -# CHECK-NEXT: 1 8 1.00 saba z0.h, z1.h, z31.h -# CHECK-NEXT: 1 8 1.00 saba z0.s, z1.s, z31.s -# CHECK-NEXT: 1 8 1.00 sabalb z0.d, z1.s, z31.s -# CHECK-NEXT: 1 8 1.00 sabalb z0.h, z1.b, z31.b -# CHECK-NEXT: 1 8 1.00 sabalb z0.s, z1.h, z31.h -# CHECK-NEXT: 1 8 1.00 sabalt z0.d, z1.s, z31.s -# CHECK-NEXT: 1 8 1.00 sabalt z0.h, z1.b, z31.b -# CHECK-NEXT: 1 8 1.00 sabalt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 rsubhnb z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 rsubhnb z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 rsubhnb z0.s, z1.d, z31.d +# CHECK-NEXT: 1 8 0.50 rsubhnt z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 rsubhnt z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 rsubhnt z0.s, z1.d, z31.d +# CHECK-NEXT: 1 6 1.00 saba z0.b, z1.b, z31.b +# CHECK-NEXT: 1 6 1.00 saba z0.d, z1.d, z31.d +# CHECK-NEXT: 1 6 1.00 saba z0.h, z1.h, z31.h +# CHECK-NEXT: 1 6 1.00 saba z0.s, z1.s, z31.s +# CHECK-NEXT: 1 6 1.00 sabalb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 6 1.00 sabalb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 6 1.00 sabalb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 6 1.00 sabalt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 6 1.00 sabalt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 6 1.00 sabalt z0.s, z1.h, z31.h # CHECK-NEXT: 1 3 0.50 sabd z31.b, p7/m, z31.b, z31.b # CHECK-NEXT: 1 3 0.50 sabd z31.d, p7/m, z31.d, z31.d # CHECK-NEXT: 1 3 0.50 sabd z31.h, p7/m, z31.h, z31.h @@ -5430,24 +5430,24 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 1.00 sadalp z0.h, p0/m, z1.b # CHECK-NEXT: 1 7 1.00 sadalp z29.s, p0/m, z30.h # CHECK-NEXT: 1 7 1.00 sadalp z30.d, p7/m, z31.s -# CHECK-NEXT: 1 3 0.50 saddlb z0.h, z1.b, z2.b -# CHECK-NEXT: 1 3 0.50 saddlb z29.s, z30.h, z31.h -# CHECK-NEXT: 1 3 0.50 saddlb z31.d, z31.s, z31.s -# CHECK-NEXT: 1 3 0.50 saddlbt z0.d, z1.s, z31.s -# CHECK-NEXT: 1 3 0.50 saddlbt z0.h, z1.b, z31.b -# CHECK-NEXT: 1 3 0.50 saddlbt z0.s, z1.h, z31.h -# CHECK-NEXT: 1 3 0.50 saddlt z0.h, z1.b, z2.b -# CHECK-NEXT: 1 3 0.50 saddlt z29.s, z30.h, z31.h -# CHECK-NEXT: 1 3 0.50 saddlt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 saddlb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 saddlb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 saddlb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 saddlbt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 saddlbt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 saddlbt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 saddlt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 saddlt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 saddlt z31.d, z31.s, z31.s # CHECK-NEXT: 1 4 1.00 saddv d0, p7, z31.b # CHECK-NEXT: 1 4 1.00 saddv d0, p7, z31.h # CHECK-NEXT: 1 4 1.00 saddv d0, p7, z31.s -# CHECK-NEXT: 1 3 0.50 saddwb z0.h, z1.h, z2.b -# CHECK-NEXT: 1 3 0.50 saddwb z29.s, z30.s, z31.h -# CHECK-NEXT: 1 3 0.50 saddwb z31.d, z31.d, z31.s -# CHECK-NEXT: 1 3 0.50 saddwt z0.h, z1.h, z2.b -# CHECK-NEXT: 1 3 0.50 saddwt z29.s, z30.s, z31.h -# CHECK-NEXT: 1 3 0.50 saddwt z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 saddwb z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 saddwb z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 saddwb z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 saddwt z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 saddwt z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 saddwt z31.d, z31.d, z31.s # CHECK-NEXT: 1 4 0.50 sbclb z0.d, z1.d, z31.d # CHECK-NEXT: 1 4 0.50 sbclb z0.s, z1.s, z31.s # CHECK-NEXT: 1 4 0.50 sbclt z0.d, z1.d, z31.d @@ -5504,8 +5504,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 sli z31.d, z31.d, #63 # CHECK-NEXT: 1 3 0.50 sli z31.h, z31.h, #15 # CHECK-NEXT: 1 3 0.50 sli z31.s, z31.s, #31 -# CHECK-NEXT: 1 8 1.00 sm4e z0.s, z0.s, z31.s -# CHECK-NEXT: 1 8 1.00 sm4ekey z0.s, z1.s, z31.s +# CHECK-NEXT: 1 9 1.00 sm4e z0.s, z0.s, z31.s +# CHECK-NEXT: 1 9 1.00 sm4ekey z0.s, z1.s, z31.s # CHECK-NEXT: 1 3 0.50 smax z0.b, z0.b, #-128 # CHECK-NEXT: 1 3 0.50 smax z0.d, z0.d, #-128 # CHECK-NEXT: 1 3 0.50 smax z0.h, z0.h, #-128 @@ -5624,61 +5624,61 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 sqcadd z31.d, z31.d, z31.d, #270 # CHECK-NEXT: 1 4 0.50 sqcadd z31.h, z31.h, z31.h, #270 # CHECK-NEXT: 1 4 0.50 sqcadd z31.s, z31.s, z31.s, #270 -# CHECK-NEXT: 1 1 0.33 sqdecb x0 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, #14 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, w0 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdecb x0, w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdecd x0 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, #14 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, w0 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdecd x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecb x0 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, #14 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, w0 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdecb x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecd x0 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, #14 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, w0 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdecd x0, w0, pow2, mul #16 # CHECK-NEXT: 1 4 0.50 sqdecd z0.d # CHECK-NEXT: 1 4 0.50 sqdecd z0.d, all, mul #16 # CHECK-NEXT: 1 4 0.50 sqdecd z0.d, pow2 # CHECK-NEXT: 1 4 0.50 sqdecd z0.d, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdech x0 -# CHECK-NEXT: 1 1 0.33 sqdech x0, #14 -# CHECK-NEXT: 1 1 0.33 sqdech x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdech x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdech x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqdech x0, w0 -# CHECK-NEXT: 1 1 0.33 sqdech x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdech x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdech x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdech x0 +# CHECK-NEXT: 1 4 0.33 sqdech x0, #14 +# CHECK-NEXT: 1 4 0.33 sqdech x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdech x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdech x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqdech x0, w0 +# CHECK-NEXT: 1 4 0.33 sqdech x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdech x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdech x0, w0, pow2, mul #16 # CHECK-NEXT: 1 4 0.50 sqdech z0.h # CHECK-NEXT: 1 4 0.50 sqdech z0.h, all, mul #16 # CHECK-NEXT: 1 4 0.50 sqdech z0.h, pow2 # CHECK-NEXT: 1 4 0.50 sqdech z0.h, pow2, mul #16 -# CHECK-NEXT: 1 8 1.00 sqdecp x0, p0.b -# CHECK-NEXT: 1 8 1.00 sqdecp x0, p0.d -# CHECK-NEXT: 1 8 1.00 sqdecp x0, p0.h -# CHECK-NEXT: 1 8 1.00 sqdecp x0, p0.s -# CHECK-NEXT: 1 8 1.00 sqdecp xzr, p15.b, wzr -# CHECK-NEXT: 1 8 1.00 sqdecp xzr, p15.d, wzr -# CHECK-NEXT: 1 8 1.00 sqdecp xzr, p15.h, wzr -# CHECK-NEXT: 1 8 1.00 sqdecp xzr, p15.s, wzr +# CHECK-NEXT: 1 9 1.00 sqdecp x0, p0.b +# CHECK-NEXT: 1 9 1.00 sqdecp x0, p0.d +# CHECK-NEXT: 1 9 1.00 sqdecp x0, p0.h +# CHECK-NEXT: 1 9 1.00 sqdecp x0, p0.s +# CHECK-NEXT: 1 9 1.00 sqdecp xzr, p15.b, wzr +# CHECK-NEXT: 1 9 1.00 sqdecp xzr, p15.d, wzr +# CHECK-NEXT: 1 9 1.00 sqdecp xzr, p15.h, wzr +# CHECK-NEXT: 1 9 1.00 sqdecp xzr, p15.s, wzr # CHECK-NEXT: 1 4 0.50 sqdecp z0.d, p0.d # CHECK-NEXT: 1 4 0.50 sqdecp z0.h, p0.h # CHECK-NEXT: 1 4 0.50 sqdecp z0.s, p0.s -# CHECK-NEXT: 1 1 0.33 sqdecw x0 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, #14 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, w0 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqdecw x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecw x0 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, #14 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, w0 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqdecw x0, w0, pow2, mul #16 # CHECK-NEXT: 1 4 0.50 sqdecw z0.s # CHECK-NEXT: 1 4 0.50 sqdecw z0.s, all, mul #16 # CHECK-NEXT: 1 4 0.50 sqdecw z0.s, pow2 @@ -5726,61 +5726,61 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 sqdmullt z0.s, z1.h, z7.h[7] # CHECK-NEXT: 1 4 0.50 sqdmullt z29.s, z30.h, z31.h # CHECK-NEXT: 1 4 0.50 sqdmullt z31.d, z31.s, z31.s -# CHECK-NEXT: 1 1 0.33 sqincb x0 -# CHECK-NEXT: 1 1 0.33 sqincb x0, #14 -# CHECK-NEXT: 1 1 0.33 sqincb x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqincb x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqincb x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqincb x0, w0 -# CHECK-NEXT: 1 1 0.33 sqincb x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqincb x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqincb x0, w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 sqincd x0 -# CHECK-NEXT: 1 1 0.33 sqincd x0, #14 -# CHECK-NEXT: 1 1 0.33 sqincd x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqincd x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqincd x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqincd x0, w0 -# CHECK-NEXT: 1 1 0.33 sqincd x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqincd x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqincd x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincb x0 +# CHECK-NEXT: 1 4 0.33 sqincb x0, #14 +# CHECK-NEXT: 1 4 0.33 sqincb x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincb x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqincb x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqincb x0, w0 +# CHECK-NEXT: 1 4 0.33 sqincb x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincb x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqincb x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincd x0 +# CHECK-NEXT: 1 4 0.33 sqincd x0, #14 +# CHECK-NEXT: 1 4 0.33 sqincd x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincd x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqincd x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqincd x0, w0 +# CHECK-NEXT: 1 4 0.33 sqincd x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincd x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqincd x0, w0, pow2, mul #16 # CHECK-NEXT: 1 4 0.50 sqincd z0.d # CHECK-NEXT: 1 4 0.50 sqincd z0.d, all, mul #16 # CHECK-NEXT: 1 4 0.50 sqincd z0.d, pow2 # CHECK-NEXT: 1 4 0.50 sqincd z0.d, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 sqinch x0 -# CHECK-NEXT: 1 1 0.33 sqinch x0, #14 -# CHECK-NEXT: 1 1 0.33 sqinch x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqinch x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqinch x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqinch x0, w0 -# CHECK-NEXT: 1 1 0.33 sqinch x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqinch x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqinch x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqinch x0 +# CHECK-NEXT: 1 4 0.33 sqinch x0, #14 +# CHECK-NEXT: 1 4 0.33 sqinch x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqinch x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqinch x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqinch x0, w0 +# CHECK-NEXT: 1 4 0.33 sqinch x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqinch x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqinch x0, w0, pow2, mul #16 # CHECK-NEXT: 1 4 0.50 sqinch z0.h # CHECK-NEXT: 1 4 0.50 sqinch z0.h, all, mul #16 # CHECK-NEXT: 1 4 0.50 sqinch z0.h, pow2 # CHECK-NEXT: 1 4 0.50 sqinch z0.h, pow2, mul #16 -# CHECK-NEXT: 1 8 1.00 sqincp x0, p0.b -# CHECK-NEXT: 1 8 1.00 sqincp x0, p0.d -# CHECK-NEXT: 1 8 1.00 sqincp x0, p0.h -# CHECK-NEXT: 1 8 1.00 sqincp x0, p0.s -# CHECK-NEXT: 1 8 1.00 sqincp xzr, p15.b, wzr -# CHECK-NEXT: 1 8 1.00 sqincp xzr, p15.d, wzr -# CHECK-NEXT: 1 8 1.00 sqincp xzr, p15.h, wzr -# CHECK-NEXT: 1 8 1.00 sqincp xzr, p15.s, wzr +# CHECK-NEXT: 1 9 1.00 sqincp x0, p0.b +# CHECK-NEXT: 1 9 1.00 sqincp x0, p0.d +# CHECK-NEXT: 1 9 1.00 sqincp x0, p0.h +# CHECK-NEXT: 1 9 1.00 sqincp x0, p0.s +# CHECK-NEXT: 1 9 1.00 sqincp xzr, p15.b, wzr +# CHECK-NEXT: 1 9 1.00 sqincp xzr, p15.d, wzr +# CHECK-NEXT: 1 9 1.00 sqincp xzr, p15.h, wzr +# CHECK-NEXT: 1 9 1.00 sqincp xzr, p15.s, wzr # CHECK-NEXT: 1 4 0.50 sqincp z0.d, p0.d # CHECK-NEXT: 1 4 0.50 sqincp z0.h, p0.h # CHECK-NEXT: 1 4 0.50 sqincp z0.s, p0.s -# CHECK-NEXT: 1 1 0.33 sqincw x0 -# CHECK-NEXT: 1 1 0.33 sqincw x0, #14 -# CHECK-NEXT: 1 1 0.33 sqincw x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqincw x0, pow2 -# CHECK-NEXT: 1 1 0.33 sqincw x0, vl1 -# CHECK-NEXT: 1 1 0.33 sqincw x0, w0 -# CHECK-NEXT: 1 1 0.33 sqincw x0, w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 sqincw x0, w0, pow2 -# CHECK-NEXT: 1 1 0.33 sqincw x0, w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincw x0 +# CHECK-NEXT: 1 4 0.33 sqincw x0, #14 +# CHECK-NEXT: 1 4 0.33 sqincw x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincw x0, pow2 +# CHECK-NEXT: 1 4 0.33 sqincw x0, vl1 +# CHECK-NEXT: 1 4 0.33 sqincw x0, w0 +# CHECK-NEXT: 1 4 0.33 sqincw x0, w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 sqincw x0, w0, pow2 +# CHECK-NEXT: 1 4 0.33 sqincw x0, w0, pow2, mul #16 # CHECK-NEXT: 1 4 0.50 sqincw z0.s # CHECK-NEXT: 1 4 0.50 sqincw z0.s, all, mul #16 # CHECK-NEXT: 1 4 0.50 sqincw z0.s, pow2 @@ -6001,24 +6001,24 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 ssra z31.d, z31.d, #64 # CHECK-NEXT: 1 4 0.50 ssra z31.h, z31.h, #16 # CHECK-NEXT: 1 4 0.50 ssra z31.s, z31.s, #32 -# CHECK-NEXT: 1 3 0.50 ssublb z0.h, z1.b, z2.b -# CHECK-NEXT: 1 3 0.50 ssublb z29.s, z30.h, z31.h -# CHECK-NEXT: 1 3 0.50 ssublb z31.d, z31.s, z31.s -# CHECK-NEXT: 1 3 0.50 ssublbt z0.d, z1.s, z31.s -# CHECK-NEXT: 1 3 0.50 ssublbt z0.h, z1.b, z31.b -# CHECK-NEXT: 1 3 0.50 ssublbt z0.s, z1.h, z31.h -# CHECK-NEXT: 1 3 0.50 ssublt z0.h, z1.b, z2.b -# CHECK-NEXT: 1 3 0.50 ssublt z29.s, z30.h, z31.h -# CHECK-NEXT: 1 3 0.50 ssublt z31.d, z31.s, z31.s -# CHECK-NEXT: 1 3 0.50 ssubltb z0.d, z1.s, z31.s -# CHECK-NEXT: 1 3 0.50 ssubltb z0.h, z1.b, z31.b -# CHECK-NEXT: 1 3 0.50 ssubltb z0.s, z1.h, z31.h -# CHECK-NEXT: 1 3 0.50 ssubwb z0.h, z1.h, z2.b -# CHECK-NEXT: 1 3 0.50 ssubwb z29.s, z30.s, z31.h -# CHECK-NEXT: 1 3 0.50 ssubwb z31.d, z31.d, z31.s -# CHECK-NEXT: 1 3 0.50 ssubwt z0.h, z1.h, z2.b -# CHECK-NEXT: 1 3 0.50 ssubwt z29.s, z30.s, z31.h -# CHECK-NEXT: 1 3 0.50 ssubwt z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 ssublb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 ssublb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 ssublb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 ssublbt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 ssublbt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 ssublbt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 ssublt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 ssublt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 ssublt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 ssubltb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 4 0.50 ssubltb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 4 0.50 ssubltb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 4 0.50 ssubwb z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 ssubwb z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 ssubwb z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 ssubwt z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 ssubwt z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 ssubwt z31.d, z31.d, z31.s # CHECK-NEXT: 1 1 1.00 * st1b { z0.b }, p0, [x0, x0] # CHECK-NEXT: 1 1 1.00 * st1b { z0.b }, p0, [x0] # CHECK-NEXT: 1 1 1.00 * st1b { z0.d }, p0, [x0, x0] @@ -6250,12 +6250,12 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 sub z31.s, p7/m, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 sub z31.s, z31.s, #65280 # CHECK-NEXT: 1 3 0.50 sub z31.s, z31.s, z31.s -# CHECK-NEXT: 1 4 0.50 subhnb z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 subhnb z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 subhnb z0.s, z1.d, z31.d -# CHECK-NEXT: 1 4 0.50 subhnt z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 subhnt z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 subhnt z0.s, z1.d, z31.d +# CHECK-NEXT: 1 8 0.50 subhnb z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 subhnb z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 subhnb z0.s, z1.d, z31.d +# CHECK-NEXT: 1 8 0.50 subhnt z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 subhnt z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 subhnt z0.s, z1.d, z31.d # CHECK-NEXT: 1 3 0.50 subr z0.b, p0/m, z0.b, z0.b # CHECK-NEXT: 1 3 0.50 subr z0.b, z0.b, #0 # CHECK-NEXT: 1 3 0.50 subr z0.d, p0/m, z0.d, z0.d @@ -6305,32 +6305,32 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 tbx z31.d, z31.d, z31.d # CHECK-NEXT: 1 4 0.50 tbx z31.h, z31.h, z31.h # CHECK-NEXT: 1 4 0.50 tbx z31.s, z31.s, z31.s -# CHECK-NEXT: 1 6 1.00 trn1 p15.b, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 trn1 p15.d, p15.d, p15.d -# CHECK-NEXT: 1 6 1.00 trn1 p15.h, p15.h, p15.h -# CHECK-NEXT: 1 6 1.00 trn1 p15.s, p15.s, p15.s -# CHECK-NEXT: 1 4 0.50 trn1 z31.b, z31.b, z31.b -# CHECK-NEXT: 1 4 0.50 trn1 z31.d, z31.d, z31.d -# CHECK-NEXT: 1 4 0.50 trn1 z31.h, z31.h, z31.h -# CHECK-NEXT: 1 4 0.50 trn1 z31.s, z31.s, z31.s -# CHECK-NEXT: 1 6 1.00 trn2 p15.b, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 trn2 p15.d, p15.d, p15.d -# CHECK-NEXT: 1 6 1.00 trn2 p15.h, p15.h, p15.h -# CHECK-NEXT: 1 6 1.00 trn2 p15.s, p15.s, p15.s -# CHECK-NEXT: 1 4 0.50 trn2 z31.b, z31.b, z31.b -# CHECK-NEXT: 1 4 0.50 trn2 z31.d, z31.d, z31.d -# CHECK-NEXT: 1 4 0.50 trn2 z31.h, z31.h, z31.h -# CHECK-NEXT: 1 4 0.50 trn2 z31.s, z31.s, z31.s -# CHECK-NEXT: 1 8 1.00 uaba z0.b, z1.b, z31.b -# CHECK-NEXT: 1 8 1.00 uaba z0.d, z1.d, z31.d -# CHECK-NEXT: 1 8 1.00 uaba z0.h, z1.h, z31.h -# CHECK-NEXT: 1 8 1.00 uaba z0.s, z1.s, z31.s -# CHECK-NEXT: 1 8 1.00 uabalb z0.d, z1.s, z31.s -# CHECK-NEXT: 1 8 1.00 uabalb z0.h, z1.b, z31.b -# CHECK-NEXT: 1 8 1.00 uabalb z0.s, z1.h, z31.h -# CHECK-NEXT: 1 8 1.00 uabalt z0.d, z1.s, z31.s -# CHECK-NEXT: 1 8 1.00 uabalt z0.h, z1.b, z31.b -# CHECK-NEXT: 1 8 1.00 uabalt z0.s, z1.h, z31.h +# CHECK-NEXT: 1 2 1.00 trn1 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 trn1 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 2 1.00 trn1 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 2 1.00 trn1 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 trn1 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 trn1 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 trn1 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 trn1 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 2 1.00 trn2 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 trn2 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 2 1.00 trn2 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 2 1.00 trn2 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 trn2 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 trn2 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 trn2 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 trn2 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 6 1.00 uaba z0.b, z1.b, z31.b +# CHECK-NEXT: 1 6 1.00 uaba z0.d, z1.d, z31.d +# CHECK-NEXT: 1 6 1.00 uaba z0.h, z1.h, z31.h +# CHECK-NEXT: 1 6 1.00 uaba z0.s, z1.s, z31.s +# CHECK-NEXT: 1 6 1.00 uabalb z0.d, z1.s, z31.s +# CHECK-NEXT: 1 6 1.00 uabalb z0.h, z1.b, z31.b +# CHECK-NEXT: 1 6 1.00 uabalb z0.s, z1.h, z31.h +# CHECK-NEXT: 1 6 1.00 uabalt z0.d, z1.s, z31.s +# CHECK-NEXT: 1 6 1.00 uabalt z0.h, z1.b, z31.b +# CHECK-NEXT: 1 6 1.00 uabalt z0.s, z1.h, z31.h # CHECK-NEXT: 1 3 0.50 uabd z31.b, p7/m, z31.b, z31.b # CHECK-NEXT: 1 3 0.50 uabd z31.d, p7/m, z31.d, z31.d # CHECK-NEXT: 1 3 0.50 uabd z31.h, p7/m, z31.h, z31.h @@ -6344,22 +6344,22 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 7 1.00 uadalp z0.h, p0/m, z1.b # CHECK-NEXT: 1 7 1.00 uadalp z29.s, p0/m, z30.h # CHECK-NEXT: 1 7 1.00 uadalp z30.d, p7/m, z31.s -# CHECK-NEXT: 1 3 0.50 uaddlb z0.h, z1.b, z2.b -# CHECK-NEXT: 1 3 0.50 uaddlb z29.s, z30.h, z31.h -# CHECK-NEXT: 1 3 0.50 uaddlb z31.d, z31.s, z31.s -# CHECK-NEXT: 1 3 0.50 uaddlt z0.h, z1.b, z2.b -# CHECK-NEXT: 1 3 0.50 uaddlt z29.s, z30.h, z31.h -# CHECK-NEXT: 1 3 0.50 uaddlt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 uaddlb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 uaddlb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 uaddlb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 uaddlt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 uaddlt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 uaddlt z31.d, z31.s, z31.s # CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.b # CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.d # CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.h # CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.s -# CHECK-NEXT: 1 3 0.50 uaddwb z0.h, z1.h, z2.b -# CHECK-NEXT: 1 3 0.50 uaddwb z29.s, z30.s, z31.h -# CHECK-NEXT: 1 3 0.50 uaddwb z31.d, z31.d, z31.s -# CHECK-NEXT: 1 3 0.50 uaddwt z0.h, z1.h, z2.b -# CHECK-NEXT: 1 3 0.50 uaddwt z29.s, z30.s, z31.h -# CHECK-NEXT: 1 3 0.50 uaddwt z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 uaddwb z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 uaddwb z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 uaddwb z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 uaddwt z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 uaddwt z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 uaddwt z31.d, z31.d, z31.s # CHECK-NEXT: 1 4 0.50 ucvtf z0.d, p0/m, z0.d # CHECK-NEXT: 1 4 0.50 ucvtf z0.d, p0/m, z0.s # CHECK-NEXT: 1 4 0.50 ucvtf z0.h, p0/m, z0.d @@ -6473,120 +6473,120 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 uqadd z31.d, z31.d, #65280 # CHECK-NEXT: 1 4 0.50 uqadd z31.h, z31.h, #65280 # CHECK-NEXT: 1 4 0.50 uqadd z31.s, z31.s, #65280 -# CHECK-NEXT: 1 1 0.33 uqdecb w0 -# CHECK-NEXT: 1 1 0.33 uqdecb w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecb w0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdecb w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecb x0 -# CHECK-NEXT: 1 1 0.33 uqdecb x0, #14 -# CHECK-NEXT: 1 1 0.33 uqdecb x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecb x0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdecb x0, vl1 -# CHECK-NEXT: 1 1 0.33 uqdecd w0 -# CHECK-NEXT: 1 1 0.33 uqdecd w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecd w0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdecd w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecd x0 -# CHECK-NEXT: 1 1 0.33 uqdecd x0, #14 -# CHECK-NEXT: 1 1 0.33 uqdecd x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecd x0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdecd x0, vl1 +# CHECK-NEXT: 1 4 0.33 uqdecb w0 +# CHECK-NEXT: 1 4 0.33 uqdecb w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecb w0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdecb w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecb x0 +# CHECK-NEXT: 1 4 0.33 uqdecb x0, #14 +# CHECK-NEXT: 1 4 0.33 uqdecb x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecb x0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdecb x0, vl1 +# CHECK-NEXT: 1 4 0.33 uqdecd w0 +# CHECK-NEXT: 1 4 0.33 uqdecd w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecd w0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdecd w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecd x0 +# CHECK-NEXT: 1 4 0.33 uqdecd x0, #14 +# CHECK-NEXT: 1 4 0.33 uqdecd x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecd x0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdecd x0, vl1 # CHECK-NEXT: 1 4 0.50 uqdecd z0.d # CHECK-NEXT: 1 4 0.50 uqdecd z0.d, all, mul #16 # CHECK-NEXT: 1 4 0.50 uqdecd z0.d, pow2 # CHECK-NEXT: 1 4 0.50 uqdecd z0.d, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdech w0 -# CHECK-NEXT: 1 1 0.33 uqdech w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdech w0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdech w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdech x0 -# CHECK-NEXT: 1 1 0.33 uqdech x0, #14 -# CHECK-NEXT: 1 1 0.33 uqdech x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdech x0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdech x0, vl1 +# CHECK-NEXT: 1 4 0.33 uqdech w0 +# CHECK-NEXT: 1 4 0.33 uqdech w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdech w0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdech w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdech x0 +# CHECK-NEXT: 1 4 0.33 uqdech x0, #14 +# CHECK-NEXT: 1 4 0.33 uqdech x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdech x0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdech x0, vl1 # CHECK-NEXT: 1 4 0.50 uqdech z0.h # CHECK-NEXT: 1 4 0.50 uqdech z0.h, all, mul #16 # CHECK-NEXT: 1 4 0.50 uqdech z0.h, pow2 # CHECK-NEXT: 1 4 0.50 uqdech z0.h, pow2, mul #16 -# CHECK-NEXT: 1 8 1.00 uqdecp wzr, p15.b -# CHECK-NEXT: 1 8 1.00 uqdecp wzr, p15.d -# CHECK-NEXT: 1 8 1.00 uqdecp wzr, p15.h -# CHECK-NEXT: 1 8 1.00 uqdecp wzr, p15.s -# CHECK-NEXT: 1 8 1.00 uqdecp x0, p0.b -# CHECK-NEXT: 1 8 1.00 uqdecp x0, p0.d -# CHECK-NEXT: 1 8 1.00 uqdecp x0, p0.h -# CHECK-NEXT: 1 8 1.00 uqdecp x0, p0.s +# CHECK-NEXT: 1 9 1.00 uqdecp wzr, p15.b +# CHECK-NEXT: 1 9 1.00 uqdecp wzr, p15.d +# CHECK-NEXT: 1 9 1.00 uqdecp wzr, p15.h +# CHECK-NEXT: 1 9 1.00 uqdecp wzr, p15.s +# CHECK-NEXT: 1 9 1.00 uqdecp x0, p0.b +# CHECK-NEXT: 1 9 1.00 uqdecp x0, p0.d +# CHECK-NEXT: 1 9 1.00 uqdecp x0, p0.h +# CHECK-NEXT: 1 9 1.00 uqdecp x0, p0.s # CHECK-NEXT: 1 4 0.50 uqdecp z0.d, p0.d # CHECK-NEXT: 1 4 0.50 uqdecp z0.h, p0.h # CHECK-NEXT: 1 4 0.50 uqdecp z0.s, p0.s -# CHECK-NEXT: 1 1 0.33 uqdecw w0 -# CHECK-NEXT: 1 1 0.33 uqdecw w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecw w0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdecw w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecw x0 -# CHECK-NEXT: 1 1 0.33 uqdecw x0, #14 -# CHECK-NEXT: 1 1 0.33 uqdecw x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqdecw x0, pow2 -# CHECK-NEXT: 1 1 0.33 uqdecw x0, vl1 +# CHECK-NEXT: 1 4 0.33 uqdecw w0 +# CHECK-NEXT: 1 4 0.33 uqdecw w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecw w0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdecw w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecw x0 +# CHECK-NEXT: 1 4 0.33 uqdecw x0, #14 +# CHECK-NEXT: 1 4 0.33 uqdecw x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqdecw x0, pow2 +# CHECK-NEXT: 1 4 0.33 uqdecw x0, vl1 # CHECK-NEXT: 1 4 0.50 uqdecw z0.s # CHECK-NEXT: 1 4 0.50 uqdecw z0.s, all, mul #16 # CHECK-NEXT: 1 4 0.50 uqdecw z0.s, pow2 # CHECK-NEXT: 1 4 0.50 uqdecw z0.s, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincb w0 -# CHECK-NEXT: 1 1 0.33 uqincb w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincb w0, pow2 -# CHECK-NEXT: 1 1 0.33 uqincb w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincb x0 -# CHECK-NEXT: 1 1 0.33 uqincb x0, #14 -# CHECK-NEXT: 1 1 0.33 uqincb x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincb x0, pow2 -# CHECK-NEXT: 1 1 0.33 uqincb x0, vl1 -# CHECK-NEXT: 1 1 0.33 uqincd w0 -# CHECK-NEXT: 1 1 0.33 uqincd w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincd w0, pow2 -# CHECK-NEXT: 1 1 0.33 uqincd w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincd x0 -# CHECK-NEXT: 1 1 0.33 uqincd x0, #14 -# CHECK-NEXT: 1 1 0.33 uqincd x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincd x0, pow2 -# CHECK-NEXT: 1 1 0.33 uqincd x0, vl1 +# CHECK-NEXT: 1 4 0.33 uqincb w0 +# CHECK-NEXT: 1 4 0.33 uqincb w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincb w0, pow2 +# CHECK-NEXT: 1 4 0.33 uqincb w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincb x0 +# CHECK-NEXT: 1 4 0.33 uqincb x0, #14 +# CHECK-NEXT: 1 4 0.33 uqincb x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincb x0, pow2 +# CHECK-NEXT: 1 4 0.33 uqincb x0, vl1 +# CHECK-NEXT: 1 4 0.33 uqincd w0 +# CHECK-NEXT: 1 4 0.33 uqincd w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincd w0, pow2 +# CHECK-NEXT: 1 4 0.33 uqincd w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincd x0 +# CHECK-NEXT: 1 4 0.33 uqincd x0, #14 +# CHECK-NEXT: 1 4 0.33 uqincd x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincd x0, pow2 +# CHECK-NEXT: 1 4 0.33 uqincd x0, vl1 # CHECK-NEXT: 1 4 0.50 uqincd z0.d # CHECK-NEXT: 1 4 0.50 uqincd z0.d, all, mul #16 # CHECK-NEXT: 1 4 0.50 uqincd z0.d, pow2 # CHECK-NEXT: 1 4 0.50 uqincd z0.d, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqinch w0 -# CHECK-NEXT: 1 1 0.33 uqinch w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqinch w0, pow2 -# CHECK-NEXT: 1 1 0.33 uqinch w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqinch x0 -# CHECK-NEXT: 1 1 0.33 uqinch x0, #14 -# CHECK-NEXT: 1 1 0.33 uqinch x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqinch x0, pow2 -# CHECK-NEXT: 1 1 0.33 uqinch x0, vl1 +# CHECK-NEXT: 1 4 0.33 uqinch w0 +# CHECK-NEXT: 1 4 0.33 uqinch w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqinch w0, pow2 +# CHECK-NEXT: 1 4 0.33 uqinch w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 uqinch x0 +# CHECK-NEXT: 1 4 0.33 uqinch x0, #14 +# CHECK-NEXT: 1 4 0.33 uqinch x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqinch x0, pow2 +# CHECK-NEXT: 1 4 0.33 uqinch x0, vl1 # CHECK-NEXT: 1 4 0.50 uqinch z0.h # CHECK-NEXT: 1 4 0.50 uqinch z0.h, all, mul #16 # CHECK-NEXT: 1 4 0.50 uqinch z0.h, pow2 # CHECK-NEXT: 1 4 0.50 uqinch z0.h, pow2, mul #16 -# CHECK-NEXT: 1 8 1.00 uqincp wzr, p15.b -# CHECK-NEXT: 1 8 1.00 uqincp wzr, p15.d -# CHECK-NEXT: 1 8 1.00 uqincp wzr, p15.h -# CHECK-NEXT: 1 8 1.00 uqincp wzr, p15.s -# CHECK-NEXT: 1 8 1.00 uqincp x0, p0.b -# CHECK-NEXT: 1 8 1.00 uqincp x0, p0.d -# CHECK-NEXT: 1 8 1.00 uqincp x0, p0.h -# CHECK-NEXT: 1 8 1.00 uqincp x0, p0.s +# CHECK-NEXT: 1 9 1.00 uqincp wzr, p15.b +# CHECK-NEXT: 1 9 1.00 uqincp wzr, p15.d +# CHECK-NEXT: 1 9 1.00 uqincp wzr, p15.h +# CHECK-NEXT: 1 9 1.00 uqincp wzr, p15.s +# CHECK-NEXT: 1 9 1.00 uqincp x0, p0.b +# CHECK-NEXT: 1 9 1.00 uqincp x0, p0.d +# CHECK-NEXT: 1 9 1.00 uqincp x0, p0.h +# CHECK-NEXT: 1 9 1.00 uqincp x0, p0.s # CHECK-NEXT: 1 4 0.50 uqincp z0.d, p0.d # CHECK-NEXT: 1 4 0.50 uqincp z0.h, p0.h # CHECK-NEXT: 1 4 0.50 uqincp z0.s, p0.s -# CHECK-NEXT: 1 1 0.33 uqincw w0 -# CHECK-NEXT: 1 1 0.33 uqincw w0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincw w0, pow2 -# CHECK-NEXT: 1 1 0.33 uqincw w0, pow2, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincw x0 -# CHECK-NEXT: 1 1 0.33 uqincw x0, #14 -# CHECK-NEXT: 1 1 0.33 uqincw x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 uqincw x0, pow2 -# CHECK-NEXT: 1 1 0.33 uqincw x0, vl1 +# CHECK-NEXT: 1 4 0.33 uqincw w0 +# CHECK-NEXT: 1 4 0.33 uqincw w0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincw w0, pow2 +# CHECK-NEXT: 1 4 0.33 uqincw w0, pow2, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincw x0 +# CHECK-NEXT: 1 4 0.33 uqincw x0, #14 +# CHECK-NEXT: 1 4 0.33 uqincw x0, all, mul #16 +# CHECK-NEXT: 1 4 0.33 uqincw x0, pow2 +# CHECK-NEXT: 1 4 0.33 uqincw x0, vl1 # CHECK-NEXT: 1 4 0.50 uqincw z0.s # CHECK-NEXT: 1 4 0.50 uqincw z0.s, all, mul #16 # CHECK-NEXT: 1 4 0.50 uqincw z0.s, pow2 @@ -6723,18 +6723,18 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 usra z31.d, z31.d, #64 # CHECK-NEXT: 1 4 0.50 usra z31.h, z31.h, #16 # CHECK-NEXT: 1 4 0.50 usra z31.s, z31.s, #32 -# CHECK-NEXT: 1 3 0.50 usublb z0.h, z1.b, z2.b -# CHECK-NEXT: 1 3 0.50 usublb z29.s, z30.h, z31.h -# CHECK-NEXT: 1 3 0.50 usublb z31.d, z31.s, z31.s -# CHECK-NEXT: 1 3 0.50 usublt z0.h, z1.b, z2.b -# CHECK-NEXT: 1 3 0.50 usublt z29.s, z30.h, z31.h -# CHECK-NEXT: 1 3 0.50 usublt z31.d, z31.s, z31.s -# CHECK-NEXT: 1 3 0.50 usubwb z0.h, z1.h, z2.b -# CHECK-NEXT: 1 3 0.50 usubwb z29.s, z30.s, z31.h -# CHECK-NEXT: 1 3 0.50 usubwb z31.d, z31.d, z31.s -# CHECK-NEXT: 1 3 0.50 usubwt z0.h, z1.h, z2.b -# CHECK-NEXT: 1 3 0.50 usubwt z29.s, z30.s, z31.h -# CHECK-NEXT: 1 3 0.50 usubwt z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 usublb z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 usublb z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 usublb z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 usublt z0.h, z1.b, z2.b +# CHECK-NEXT: 1 4 0.50 usublt z29.s, z30.h, z31.h +# CHECK-NEXT: 1 4 0.50 usublt z31.d, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 usubwb z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 usubwb z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 usubwb z31.d, z31.d, z31.s +# CHECK-NEXT: 1 4 0.50 usubwt z0.h, z1.h, z2.b +# CHECK-NEXT: 1 4 0.50 usubwt z29.s, z30.s, z31.h +# CHECK-NEXT: 1 4 0.50 usubwt z31.d, z31.d, z31.s # CHECK-NEXT: 1 4 0.50 uunpkhi z31.d, z31.s # CHECK-NEXT: 1 4 0.50 uunpkhi z31.h, z31.b # CHECK-NEXT: 1 4 0.50 uunpkhi z31.s, z31.h @@ -6753,82 +6753,82 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 uxth z31.s, p7/m, z31.s # CHECK-NEXT: 1 3 0.50 uxtw z0.d, p0/m, z0.d # CHECK-NEXT: 1 3 0.50 uxtw z31.d, p7/m, z31.d -# CHECK-NEXT: 1 6 1.00 uzp1 p15.b, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 uzp1 p15.d, p15.d, p15.d -# CHECK-NEXT: 1 6 1.00 uzp1 p15.h, p15.h, p15.h -# CHECK-NEXT: 1 6 1.00 uzp1 p15.s, p15.s, p15.s -# CHECK-NEXT: 1 4 0.50 uzp1 z31.b, z31.b, z31.b -# CHECK-NEXT: 1 4 0.50 uzp1 z31.d, z31.d, z31.d -# CHECK-NEXT: 1 4 0.50 uzp1 z31.h, z31.h, z31.h -# CHECK-NEXT: 1 4 0.50 uzp1 z31.s, z31.s, z31.s -# CHECK-NEXT: 1 6 1.00 uzp2 p15.b, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 uzp2 p15.d, p15.d, p15.d -# CHECK-NEXT: 1 6 1.00 uzp2 p15.h, p15.h, p15.h -# CHECK-NEXT: 1 6 1.00 uzp2 p15.s, p15.s, p15.s -# CHECK-NEXT: 1 4 0.50 uzp2 z31.b, z31.b, z31.b -# CHECK-NEXT: 1 4 0.50 uzp2 z31.d, z31.d, z31.d -# CHECK-NEXT: 1 4 0.50 uzp2 z31.h, z31.h, z31.h -# CHECK-NEXT: 1 4 0.50 uzp2 z31.s, z31.s, z31.s -# CHECK-NEXT: 1 6 1.00 whilege p15.b, w0, wzr -# CHECK-NEXT: 1 6 1.00 whilege p15.b, wzr, w0 -# CHECK-NEXT: 1 6 1.00 whilege p15.b, x0, xzr -# CHECK-NEXT: 1 6 1.00 whilege p15.b, xzr, x0 -# CHECK-NEXT: 1 6 1.00 whilege p15.d, w0, wzr -# CHECK-NEXT: 1 6 1.00 whilege p15.d, x0, xzr -# CHECK-NEXT: 1 6 1.00 whilege p15.h, w0, wzr -# CHECK-NEXT: 1 6 1.00 whilege p15.h, x0, xzr -# CHECK-NEXT: 1 6 1.00 whilege p15.s, w0, wzr -# CHECK-NEXT: 1 6 1.00 whilege p15.s, x0, xzr -# CHECK-NEXT: 1 6 1.00 whilerw p15.b, x30, x30 -# CHECK-NEXT: 1 6 1.00 whilerw p15.d, x30, x30 -# CHECK-NEXT: 1 6 1.00 whilerw p15.h, x30, x30 -# CHECK-NEXT: 1 6 1.00 whilerw p15.s, x30, x30 -# CHECK-NEXT: 1 6 1.00 whilewr p15.b, x30, x30 -# CHECK-NEXT: 1 6 1.00 whilewr p15.d, x30, x30 -# CHECK-NEXT: 1 6 1.00 whilewr p15.h, x30, x30 -# CHECK-NEXT: 1 6 1.00 whilewr p15.s, x30, x30 +# CHECK-NEXT: 1 2 1.00 uzp1 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 uzp1 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 2 1.00 uzp1 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 2 1.00 uzp1 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 uzp1 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 uzp1 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 uzp1 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 uzp1 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 2 1.00 uzp2 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 uzp2 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 2 1.00 uzp2 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 2 1.00 uzp2 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 uzp2 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 uzp2 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 uzp2 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 uzp2 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 2 1.00 whilege p15.b, w0, wzr +# CHECK-NEXT: 1 2 1.00 whilege p15.b, wzr, w0 +# CHECK-NEXT: 1 2 1.00 whilege p15.b, x0, xzr +# CHECK-NEXT: 1 2 1.00 whilege p15.b, xzr, x0 +# CHECK-NEXT: 1 2 1.00 whilege p15.d, w0, wzr +# CHECK-NEXT: 1 2 1.00 whilege p15.d, x0, xzr +# CHECK-NEXT: 1 2 1.00 whilege p15.h, w0, wzr +# CHECK-NEXT: 1 2 1.00 whilege p15.h, x0, xzr +# CHECK-NEXT: 1 2 1.00 whilege p15.s, w0, wzr +# CHECK-NEXT: 1 2 1.00 whilege p15.s, x0, xzr +# CHECK-NEXT: 1 2 1.00 whilerw p15.b, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilerw p15.d, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilerw p15.h, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilerw p15.s, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilewr p15.b, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilewr p15.d, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilewr p15.h, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilewr p15.s, x30, x30 # CHECK-NEXT: 1 1 0.33 * U wrffr p0.b # CHECK-NEXT: 1 1 0.33 * U wrffr p15.b -# CHECK-NEXT: 1 3 0.50 xar z0.b, z0.b, z1.b, #1 -# CHECK-NEXT: 1 3 0.50 xar z0.d, z0.d, z1.d, #1 -# CHECK-NEXT: 1 3 0.50 xar z0.h, z0.h, z1.h, #1 -# CHECK-NEXT: 1 3 0.50 xar z0.s, z0.s, z1.s, #1 -# CHECK-NEXT: 1 3 0.50 xar z31.b, z31.b, z30.b, #8 -# CHECK-NEXT: 1 3 0.50 xar z31.d, z31.d, z30.d, #64 -# CHECK-NEXT: 1 3 0.50 xar z31.h, z31.h, z30.h, #16 -# CHECK-NEXT: 1 3 0.50 xar z31.s, z31.s, z30.s, #32 -# CHECK-NEXT: 1 6 1.00 zip1 p0.b, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 zip1 p0.d, p0.d, p0.d -# CHECK-NEXT: 1 6 1.00 zip1 p0.h, p0.h, p0.h -# CHECK-NEXT: 1 6 1.00 zip1 p0.s, p0.s, p0.s -# CHECK-NEXT: 1 6 1.00 zip1 p15.b, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 zip1 p15.d, p15.d, p15.d -# CHECK-NEXT: 1 6 1.00 zip1 p15.h, p15.h, p15.h -# CHECK-NEXT: 1 6 1.00 zip1 p15.s, p15.s, p15.s -# CHECK-NEXT: 1 4 0.50 zip1 z0.b, z0.b, z0.b -# CHECK-NEXT: 1 4 0.50 zip1 z0.d, z0.d, z0.d -# CHECK-NEXT: 1 4 0.50 zip1 z0.h, z0.h, z0.h -# CHECK-NEXT: 1 4 0.50 zip1 z0.s, z0.s, z0.s -# CHECK-NEXT: 1 4 0.50 zip1 z31.b, z31.b, z31.b -# CHECK-NEXT: 1 4 0.50 zip1 z31.d, z31.d, z31.d -# CHECK-NEXT: 1 4 0.50 zip1 z31.h, z31.h, z31.h -# CHECK-NEXT: 1 4 0.50 zip1 z31.s, z31.s, z31.s -# CHECK-NEXT: 1 6 1.00 zip2 p0.b, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 zip2 p0.d, p0.d, p0.d -# CHECK-NEXT: 1 6 1.00 zip2 p0.h, p0.h, p0.h -# CHECK-NEXT: 1 6 1.00 zip2 p0.s, p0.s, p0.s -# CHECK-NEXT: 1 6 1.00 zip2 p15.b, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 zip2 p15.d, p15.d, p15.d -# CHECK-NEXT: 1 6 1.00 zip2 p15.h, p15.h, p15.h -# CHECK-NEXT: 1 6 1.00 zip2 p15.s, p15.s, p15.s -# CHECK-NEXT: 1 4 0.50 zip2 z0.b, z0.b, z0.b -# CHECK-NEXT: 1 4 0.50 zip2 z0.d, z0.d, z0.d -# CHECK-NEXT: 1 4 0.50 zip2 z0.h, z0.h, z0.h -# CHECK-NEXT: 1 4 0.50 zip2 z0.s, z0.s, z0.s -# CHECK-NEXT: 1 4 0.50 zip2 z31.b, z31.b, z31.b -# CHECK-NEXT: 1 4 0.50 zip2 z31.d, z31.d, z31.d -# CHECK-NEXT: 1 4 0.50 zip2 z31.h, z31.h, z31.h -# CHECK-NEXT: 1 4 0.50 zip2 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 xar z0.b, z0.b, z1.b, #1 +# CHECK-NEXT: 1 4 0.50 xar z0.d, z0.d, z1.d, #1 +# CHECK-NEXT: 1 4 0.50 xar z0.h, z0.h, z1.h, #1 +# CHECK-NEXT: 1 4 0.50 xar z0.s, z0.s, z1.s, #1 +# CHECK-NEXT: 1 4 0.50 xar z31.b, z31.b, z30.b, #8 +# CHECK-NEXT: 1 4 0.50 xar z31.d, z31.d, z30.d, #64 +# CHECK-NEXT: 1 4 0.50 xar z31.h, z31.h, z30.h, #16 +# CHECK-NEXT: 1 4 0.50 xar z31.s, z31.s, z30.s, #32 +# CHECK-NEXT: 1 2 1.00 zip1 p0.b, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 zip1 p0.d, p0.d, p0.d +# CHECK-NEXT: 1 2 1.00 zip1 p0.h, p0.h, p0.h +# CHECK-NEXT: 1 2 1.00 zip1 p0.s, p0.s, p0.s +# CHECK-NEXT: 1 2 1.00 zip1 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 zip1 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 2 1.00 zip1 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 2 1.00 zip1 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 zip1 z0.b, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 zip1 z0.d, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 zip1 z0.h, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 zip1 z0.s, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 zip1 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 zip1 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 zip1 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 zip1 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 2 1.00 zip2 p0.b, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 zip2 p0.d, p0.d, p0.d +# CHECK-NEXT: 1 2 1.00 zip2 p0.h, p0.h, p0.h +# CHECK-NEXT: 1 2 1.00 zip2 p0.s, p0.s, p0.s +# CHECK-NEXT: 1 2 1.00 zip2 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 zip2 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 2 1.00 zip2 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 2 1.00 zip2 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 zip2 z0.b, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 zip2 z0.d, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 zip2 z0.h, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 zip2 z0.s, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 zip2 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 zip2 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 zip2 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 zip2 z31.s, z31.s, z31.s # CHECK: Resources: # CHECK-NEXT: [0] - CortexA510UnitALU0 @@ -6848,7 +6848,7 @@ zip2 z31.s, z31.s, z31.s # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] [6] [7] [8] [9] [10.0] [10.1] [11] -# CHECK-NEXT: 79.00 75.00 75.00 9.00 - 240.00 3698.00 - - 1290.00 924.00 199.50 199.50 670.00 +# CHECK-NEXT: 79.00 75.00 75.00 9.00 - 209.00 3667.00 - - 1290.00 924.00 199.50 199.50 670.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] [6] [7] [8] [9] [10.0] [10.1] [11] Instructions: @@ -7844,8 +7844,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1b { z0.d }, p0/z, [x0] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1b { z0.d }, p0/z, [z0.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1b { z0.h }, p0/z, [x0] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1b { z0.s }, p0/z, [x0] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1b { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1b { z21.b }, p5/z, [x10, #5, mul vl] @@ -7882,8 +7882,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z0.d }, p0/z, [x0] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1h { z0.d }, p0/z, [z0.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z0.h }, p0/z, [x0] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z0.s }, p0/z, [x0] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1h { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z21.d }, p5/z, [x10, #5, mul vl] @@ -7899,8 +7899,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1h { z31.d }, p7/z, [z31.d, #62] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z31.h }, p7/z, [sp, #-1, mul vl] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, #-1, mul vl] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1h { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z5.h }, p3/z, [sp, x16, lsl #1] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z5.h }, p3/z, [x17, x16, lsl #1] @@ -7961,7 +7961,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z0.h }, p0/z, [sp, x0] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z0.h }, p0/z, [x0, x0] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z0.h }, p0/z, [x0] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z0.s }, p0/z, [x0] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1sb { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z21.d }, p5/z, [x10, #5, mul vl] @@ -7981,8 +7981,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sh { z0.d }, p0/z, [x0] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sh { z0.d }, p0/z, [z0.d] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1sh { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sh { z21.d }, p5/z, [x10, #5, mul vl] @@ -7997,8 +7997,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sh { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sh { z31.d }, p7/z, [z31.d, #62] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, #-1, mul vl] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1sh { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2] @@ -8017,8 +8017,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1w { z0.d }, p0/z, [x0] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1w { z0.d }, p0/z, [z0.d] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1w { z0.s }, p0/z, [x0] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1w { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1w { z21.d }, p5/z, [x10, #5, mul vl] @@ -8033,8 +8033,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1w { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1w { z31.d }, p7/z, [z31.d, #124] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, #-1, mul vl] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1w { z31.s }, p7/z, [z31.s, #124] # CHECK-NEXT: - - - - - - 2.00 - - - - - - - ld2b { z0.b, z1.b }, p0/z, [x0, x0] # CHECK-NEXT: - - - - - - 1.00 - - - - - - - ld2b { z0.b, z1.b }, p0/z, [x0] @@ -8100,8 +8100,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1b { z0.d }, p0/z, [z0.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1b { z0.h }, p0/z, [x0, x0] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, x0] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1b { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1b { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1b { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -8128,8 +8128,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1h { z0.d }, p0/z, [z0.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z0.h }, p0/z, [x0, x0, lsl #1] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, x0, lsl #1] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1h { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1h { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1h { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -8138,16 +8138,16 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z31.d }, p7/z, [sp] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1h { z31.d }, p7/z, [z31.d, #62] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z31.h }, p7/z, [sp] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1h { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sb { z0.d }, p0/z, [x0, x0] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sb { z0.d }, p0/z, [z0.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sb { z0.h }, p0/z, [x0, x0] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, x0] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1sb { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sb { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sb { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -8162,8 +8162,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z0.d }, p0/z, [z0.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, x0, lsl #1] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1sh { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -8171,8 +8171,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sh { z31.d }, p7/z, [sp] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z31.d }, p7/z, [z31.d, #62] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1sh { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sw { z0.d }, p0/z, [x0, x0, lsl #2] @@ -8190,8 +8190,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z0.d }, p0/z, [z0.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, x0, lsl #2] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1w { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -8199,8 +8199,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1w { z31.d }, p7/z, [sp] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z31.d }, p7/z, [z31.d, #124] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1w { z31.s }, p7/z, [z31.s, #124] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldnf1b { z0.b }, p0/z, [x0]