diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td index 6b788772ac889..c73f60a1a7741 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td @@ -553,6 +553,107 @@ def N3Write_16c_16V0 : SchedWriteRes<[N3UnitV0, N3UnitV0, N3UnitV0, N3UnitV0, let NumMicroOps = 16; } +//===----------------------------------------------------------------------===// +// Define forwarded types +// NOTE: SOG, p. 19, n. 2: Accumulator forwarding is not supported for +// consumers of 64 bit multiply high operations? + +def N3Wr_FMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; } +def N3Rd_FMA : SchedReadAdvance<2, [WriteFMul, N3Wr_FMA]>; + +def N3Wr_VMA : SchedWriteRes<[N3UnitV0]> { let Latency = 4; } +def N3Rd_VMA : SchedReadAdvance<3, [N3Wr_VMA]>; + +def N3Wr_VMAL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; } +def N3Rd_VMAL : SchedReadAdvance<3, [N3Wr_VMAL]>; + +def N3Wr_VMAH : SchedWriteRes<[N3UnitV0]> { let Latency = 4; } +def N3Rd_VMAH : SchedReadAdvance<2, [N3Wr_VMAH]>; + +def N3Wr_VMASL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; } +def N3Rd_VMASL : SchedReadAdvance<2, [N3Wr_VMASL]>; + +def N3Wr_ADA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; } +def N3Rd_ADA : SchedReadAdvance<3, [N3Wr_ADA]>; + +def N3Wr_VDOT : SchedWriteRes<[N3UnitV]> { let Latency = 3; } +def N3Rd_VDOT : SchedReadAdvance<2, [N3Wr_VDOT]>; + +def N3Wr_VMMA : SchedWriteRes<[N3UnitV]> { let Latency = 3; } +def N3Rd_VMMA : SchedReadAdvance<2, [N3Wr_VMMA]>; + +def N3Wr_FCMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; } +def N3Rd_FCMA : SchedReadAdvance<2, [N3Wr_FCMA]>; + +def N3Wr_FPM : SchedWriteRes<[N3UnitV]> { let Latency = 3; } +def N3Wr_FPMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; } +def N3Rd_FPMA : SchedReadAdvance<2, [N3Wr_FPM, N3Wr_FPMA]>; + +def N3Wr_FPMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; } +def N3Rd_FPMAL : SchedReadAdvance<2, [N3Wr_FPMAL]>; + +def 
N3Wr_BFD : SchedWriteRes<[N3UnitV]> { let Latency = 4; } +def N3Rd_BFD : SchedReadAdvance<2, [N3Wr_BFD]>; + +def N3Wr_BFMMA : SchedWriteRes<[N3UnitV]> { let Latency = 5; } +def N3Rd_BFMMA : SchedReadAdvance<2, [N3Wr_BFMMA]>; + +def N3Wr_BFMLA : SchedWriteRes<[N3UnitV]> { let Latency = 4; } +def N3Rd_BFMLA : SchedReadAdvance<2, [N3Wr_BFMLA]>; + +def N3Wr_CRC : SchedWriteRes<[N3UnitM0]> { let Latency = 2; } +def N3Rd_CRC : SchedReadAdvance<1, [N3Wr_CRC]>; + +def N3Wr_ZA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; } +def N3Rd_ZA : SchedReadAdvance<3, [N3Wr_ZA]>; +def N3Wr_ZPA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; } +def N3Rd_ZPA : SchedReadAdvance<3, [N3Wr_ZPA]>; +def N3Wr_ZSA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; } +def N3Rd_ZSA : SchedReadAdvance<3, [N3Wr_ZSA]>; + +def N3Wr_ZDOTB : SchedWriteRes<[N3UnitV]> { let Latency = 3; } +def N3Rd_ZDOTB : SchedReadAdvance<2, [N3Wr_ZDOTB]>; +def N3Wr_ZDOTH : SchedWriteRes<[N3UnitV0]> { let Latency = 4; } +def N3Rd_ZDOTH : SchedReadAdvance<3, [N3Wr_ZDOTH]>; + +def N3Wr_ZCMABHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; } +def N3Rd_ZCMABHS : SchedReadAdvance<3, [N3Wr_ZCMABHS]>; +def N3Wr_ZCMAD : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; } +def N3Rd_ZCMAD : SchedReadAdvance<2, [N3Wr_ZCMAD]>; + +def N3Wr_ZMMA : SchedWriteRes<[N3UnitV]> { let Latency = 3; } +def N3Rd_ZMMA : SchedReadAdvance<2, [N3Wr_ZMMA]>; + +def N3Wr_ZMABHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; } +def N3Rd_ZMABHS : SchedReadAdvance<3, [N3Wr_ZMABHS]>; +def N3Wr_ZMAD : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; } +def N3Rd_ZMAD : SchedReadAdvance<2, [N3Wr_ZMAD]>; + +def N3Wr_ZMAL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; } +def N3Rd_ZMAL : SchedReadAdvance<3, [N3Wr_ZMAL]>; + +def N3Wr_ZMASQL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; } +def N3Wr_ZMASQBHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; } +def N3Wr_ZMASQD : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; } +def N3Rd_ZMASQ : 
SchedReadAdvance<2, [N3Wr_ZMASQL, N3Wr_ZMASQBHS, + N3Wr_ZMASQD]>; + +def N3Wr_ZFCMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; } +def N3Rd_ZFCMA : SchedReadAdvance<2, [N3Wr_ZFCMA]>; + +def N3Wr_ZFMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; } +def N3Rd_ZFMA : SchedReadAdvance<2, [N3Wr_ZFMA]>; + +def N3Wr_ZFMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; } +def N3Rd_ZFMAL : SchedReadAdvance<2, [N3Wr_ZFMAL]>; + +def N3Wr_ZBFDOT : SchedWriteRes<[N3UnitV]> { let Latency = 4; } +def N3Rd_ZBFDOT : SchedReadAdvance<2, [N3Wr_ZBFDOT]>; +def N3Wr_ZBFMMA : SchedWriteRes<[N3UnitV]> { let Latency = 5; } +def N3Rd_ZBFMMA : SchedReadAdvance<2, [N3Wr_ZBFMMA]>; +def N3Wr_ZBFMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; } +def N3Rd_ZBFMAL : SchedReadAdvance<2, [N3Wr_ZBFMAL]>; + // Miscellaneous // ----------------------------------------------------------------------------- @@ -832,10 +933,11 @@ def : SchedAlias; def : InstRW<[N3Write_12c_1V0], (instrs FDIVDrr, FSQRTDr)>; // FP multiply -def : SchedAlias<WriteFMul, N3Write_3c_1V>; +def : WriteRes<WriteFMul, [N3UnitV]> { let Latency = 3; } // FP multiply accumulate -def : InstRW<[N3Write_4c_1V], (instregex "^(FMADD|FMSUB|FNMADD|FNMSUB)[DHS]rrr$")>; +def : InstRW<[N3Wr_FMA, ReadDefault, ReadDefault, N3Rd_FMA], + (instregex "^(FMADD|FMSUB|FNMADD|FNMSUB)[DHS]rrr$")>; // FP round to integral def : InstRW<[N3Write_3c_1V0], (instregex "^FRINT([AIMNPXZ]|32X|64X|32Z|64Z)[DHS]r$")>; @@ -969,7 +1071,7 @@ def : SchedAlias; // ASIMD absolute diff accum long // ASIMD pairwise add and accumulate long // ASIMD shift accumulate -def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABAL?v", +def : InstRW<[N3Wr_ADA, N3Rd_ADA], (instregex "^[SU]ABAL?v", "^[SU]ADALPv", "^[SU]R?SRAv")>; @@ -984,10 +1086,11 @@ def : InstRW<[N3Write_6c_2V1], (instregex "^[SU]?ADDL?Vv16i8v$")>; // ASIMD dot product // ASIMD dot product using signed and unsigned integers -def : InstRW<[N3Write_3c_1V], (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>; +def : InstRW<[N3Wr_VDOT, N3Rd_VDOT], + (instregex 
"^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>; // ASIMD matrix multiply-accumulate -def : InstRW<[N3Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>; +def : InstRW<[N3Wr_VMMA, N3Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>; // ASIMD max/min, reduce, 4H/4S def : InstRW<[N3Write_3c_1V1], (instregex "^[SU](MAX|MIN)Vv4i(16|32)v$")>; @@ -1002,16 +1105,16 @@ def : InstRW<[N3Write_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>; def : InstRW<[N3Write_4c_1V0], (instregex "^MULv", "^SQ(R)?DMULHv")>; // ASIMD multiply accumulate -def : InstRW<[N3Write_4c_1V0], (instregex "^MLAv", "^MLSv")>; +def : InstRW<[N3Wr_VMA, N3Rd_VMA], (instregex "^MLAv", "^MLSv")>; // ASIMD multiply accumulate high -def : InstRW<[N3Write_4c_1V0], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>; +def : InstRW<[N3Wr_VMAH, N3Rd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>; // ASIMD multiply accumulate long -def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]MLALv", "^[SU]MLSLv")>; +def : InstRW<[N3Wr_VMAL, N3Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>; // ASIMD multiply accumulate saturating long -def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMLALv", "^SQDMLSLv")>; +def : InstRW<[N3Wr_VMASL, N3Rd_VMASL], (instregex "^SQDMLALv", "^SQDMLSLv")>; // ASIMD multiply/multiply long (8x8) polynomial, D-form // ASIMD multiply/multiply long (8x8) polynomial, Q-form @@ -1058,7 +1161,7 @@ def : InstRW<[N3Write_4c_1V1], def : InstRW<[N3Write_3c_1V], (instregex "^FCADDv")>; // ASIMD FP complex multiply add -def : InstRW<[N3Write_4c_1V], (instregex "^FCMLAv")>; +def : InstRW<[N3Wr_FCMA, N3Rd_FCMA], (instregex "^FCMLAv")>; // ASIMD FP convert, long (F16 to F32) def : InstRW<[N3Write_4c_2V0], (instregex "^FCVTL(v4|v8)i16")>; @@ -1114,13 +1217,13 @@ def : InstRW<[N3Write_4c_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>; def : InstRW<[N3Write_6c_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>; // ASIMD FP multiply -def : InstRW<[N3Write_3c_1V], (instregex "^FMULv", "^FMULXv")>; +def : InstRW<[N3Wr_FPM], (instregex "^FMULv", "^FMULXv")>; 
// ASIMD FP multiply accumulate -def : InstRW<[N3Write_4c_1V], (instregex "^FMLAv", "^FMLSv")>; +def : InstRW<[N3Wr_FPMA, N3Rd_FPMA], (instregex "^FMLAv", "^FMLSv")>; // ASIMD FP multiply accumulate long -def : InstRW<[N3Write_4c_1V], (instregex "^FMLALv", "^FMLSLv")>; +def : InstRW<[N3Wr_FPMAL, N3Rd_FPMAL], (instregex "^FMLALv", "^FMLSLv")>; // ASIMD FP round, D-form F32 and Q-form F64 def : InstRW<[N3Write_3c_1V0], @@ -1157,13 +1260,14 @@ def : InstRW<[N3Write_13c_2V0], (instrs FSQRTv2f64)>; def : InstRW<[N3Write_4c_2V0], (instrs BFCVTN, BFCVTN2)>; // ASIMD dot product -def : InstRW<[N3Write_4c_1V], (instrs BFDOTv4bf16, BFDOTv8bf16)>; +def : InstRW<[N3Wr_BFD, N3Rd_BFD], (instrs BFDOTv4bf16, BFDOTv8bf16)>; // ASIMD matrix multiply accumulate -def : InstRW<[N3Write_5c_1V], (instrs BFMMLA)>; +def : InstRW<[N3Wr_BFMMA, N3Rd_BFMMA], (instrs BFMMLA)>; // ASIMD multiply accumulate long -def : InstRW<[N3Write_4c_1V], (instrs BFMLALB, BFMLALBIdx, BFMLALT, BFMLALTIdx)>; +def : InstRW<[N3Wr_BFMLA, N3Rd_BFMLA], + (instrs BFMLALB, BFMLALBIdx, BFMLALT, BFMLALTIdx)>; // Scalar convert, F32 to BF16 def : InstRW<[N3Write_3c_1V0], (instrs BFCVT)>; @@ -1502,7 +1606,7 @@ def : InstRW<[N3Write_4c_1V0], (instrs SM4E, SM4ENCKEY)>; // ----------------------------------------------------------------------------- // CRC checksum ops -def : InstRW<[N3Write_2c_1M0], (instregex "^CRC32")>; +def : InstRW<[N3Wr_CRC, N3Rd_CRC], (instregex "^CRC32")>; // SVE Predicate instructions // ----------------------------------------------------------------------------- @@ -1592,10 +1696,10 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]", "^[SU]ABD_ZPZZ_[BHSD]")>; // Arithmetic, absolute diff accum -def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>; +def : InstRW<[N3Wr_ZA, N3Rd_ZA], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>; // Arithmetic, absolute diff accum long -def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>; +def : InstRW<[N3Wr_ZA, N3Rd_ZA], 
(instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>; // Arithmetic, absolute diff long def : InstRW<[N3Write_2c_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]$")>; @@ -1629,7 +1733,8 @@ def : InstRW<[N3Write_2c_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]$")>; def : InstRW<[N3Write_2c_1V], (instregex "^ADDP_ZPmZ_[BHSD]$")>; // Arithmetic, pairwise add and accum long -def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ADALP_ZPmZ_[HSD]$")>; +def : InstRW<[N3Wr_ZPA, ReadDefault, N3Rd_ZPA], + (instregex "^[SU]ADALP_ZPmZ_[HSD]$")>; // Arithmetic, shift def : InstRW<[N3Write_2c_1V1], @@ -1642,7 +1747,7 @@ def : InstRW<[N3Write_2c_1V1], "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>; // Arithmetic, shift and accumulate -def : InstRW<[N3Write_4c_1V1], +def : InstRW<[N3Wr_ZSA, N3Rd_ZSA], (instregex "^(SRSRA|SSRA|URSRA|USRA)_ZZI_[BHSD]$")>; // Arithmetic, shift by immediate @@ -1688,16 +1793,17 @@ def : InstRW<[N3Write_2c_1V], def : InstRW<[N3Write_2c_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]$")>; // Complex dot product 8-bit element -def : InstRW<[N3Write_3c_1V], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>; +def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>; // Complex dot product 16-bit element -def : InstRW<[N3Write_4c_1V0], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>; +def : InstRW<[N3Wr_ZDOTH, N3Rd_ZDOTH], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>; // Complex multiply-add B, H, S element size -def : InstRW<[N3Write_4c_1V0], (instregex "^CMLA_ZZZ_[BHS]$", "^CMLA_ZZZI_[HS]$")>; +def : InstRW<[N3Wr_ZCMABHS, N3Rd_ZCMABHS], + (instregex "^CMLA_ZZZ_[BHS]$", "^CMLA_ZZZI_[HS]$")>; // Complex multiply-add D element size -def : InstRW<[N3Write_5c_2V0], (instrs CMLA_ZZZ_D)>; +def : InstRW<[N3Wr_ZCMAD, N3Rd_ZCMAD], (instrs CMLA_ZZZ_D)>; // Conditional extract operations, scalar form def : InstRW<[N3Write_8c_1M0_1V], (instregex "^CLAST[AB]_RPZ_[BHSD]$")>; @@ -1736,13 +1842,14 @@ def : InstRW<[N3Write_16c_16V0], (instregex "^[SU]DIVR?_ZPmZ_D", "^[SU]DIV_ZPZZ_D")>; // Dot product, 8 bit -def : InstRW<[N3Write_3c_1V], (instregex 
"^[SU]DOT_ZZZI?_BtoS$")>; +def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_BtoS$")>; // Dot product, 8 bit, using signed and unsigned integers -def : InstRW<[N3Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; +def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB], + (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>; // Dot product, 16 bit -def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_HtoD$")>; +def : InstRW<[N3Wr_ZDOTH, N3Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_HtoD$")>; // Duplicate, immediate and indexed form def : InstRW<[N3Write_2c_1V], (instregex "^DUP_ZI_[BHSD]$", @@ -1804,7 +1911,7 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]", def : InstRW<[N3Write_2c_1V], (instregex "^N?MATCH_PPzZZ_[BH]$")>; // Matrix multiply-accumulate -def : InstRW<[N3Write_3c_1V], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; +def : InstRW<[N3Wr_ZMMA, N3Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; // Move prefix def : InstRW<[N3Write_2c_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$", @@ -1827,20 +1934,22 @@ def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]MULL[BT]_ZZZI_[SD]$", "^[SU]MULL[BT]_ZZZ_[HSD]$")>; // Multiply accumulate, B, H, S element size -def : InstRW<[N3Write_4c_1V0], (instregex "^ML[AS]_ZZZI_[BHS]$", - "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_[BHS]")>; +def : InstRW<[N3Wr_ZMABHS, ReadDefault, N3Rd_ZMABHS], + (instregex "^ML[AS]_ZZZI_[BHS]$", + "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_[BHS]")>; // Multiply accumulate, D element size -def : InstRW<[N3Write_5c_2V0], (instregex "^ML[AS]_ZZZI_D$", +def : InstRW<[N3Wr_ZMAD, ReadDefault, N3Rd_ZMAD], (instregex "^ML[AS]_ZZZI_D$", "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>; // Multiply accumulate long -def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$", +def : InstRW<[N3Wr_ZMAL, N3Rd_ZMAL], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$", "^[SU]ML[AS]L[BT]_ZZZI_[SD]$")>; // Multiply accumulate saturating doubling long regular -def : InstRW<[N3Write_4c_1V0], (instregex 
"^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$", - "^SQDML[AS](LB|LT)_ZZZI_[SD]$")>; +def : InstRW<[N3Wr_ZMASQL, N3Rd_ZMASQ], + (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$", + "^SQDML[AS](LB|LT)_ZZZI_[SD]$")>; // Multiply saturating doubling high, B, H, S element size def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMULH_ZZZ_[BHS]$", @@ -1854,13 +1963,13 @@ def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMULL[BT]_ZZZ_[HSD]$", "^SQDMULL[BT]_ZZZI_[SD]$")>; // Multiply saturating rounding doubling regular/complex accumulate, B, H, S element size -def : InstRW<[N3Write_4c_1V0], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$", +def : InstRW<[N3Wr_ZMASQBHS, N3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$", "^SQRDCMLAH_ZZZ_[BHS]$", "^SQRDML[AS]H_ZZZI_[HS]$", "^SQRDCMLAH_ZZZI_[HS]$")>; // Multiply saturating rounding doubling regular/complex accumulate, D element size -def : InstRW<[N3Write_5c_2V0], (instregex "^SQRDML[AS]H_ZZZI?_D$", +def : InstRW<[N3Wr_ZMASQD, N3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZI?_D$", "^SQRDCMLAH_ZZZ_D$")>; // Multiply saturating rounding doubling regular/complex, B, H, S element size @@ -1948,8 +2057,9 @@ def : InstRW<[N3Write_2c_1V], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$", def : InstRW<[N3Write_3c_1V], (instregex "^FCADD_ZPmZ_[HSD]$")>; // Floating point complex multiply add -def : InstRW<[N3Write_4c_1V], (instregex "^FCMLA_ZPmZZ_[HSD]$", - "^FCMLA_ZZZI_[HS]$")>; +def : InstRW<[N3Wr_ZFCMA, ReadDefault, N3Rd_ZFCMA], + (instregex "^FCMLA_ZPmZZ_[HSD]")>; +def : InstRW<[N3Wr_ZFCMA, N3Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>; // Floating point convert, long or narrow (F16 to F32 or F32 to F16) def : InstRW<[N3Write_4c_2V0], (instregex "^FCVT_ZPmZ_(HtoS|StoH)", @@ -2014,12 +2124,15 @@ def : InstRW<[N3Write_3c_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]", "^FMUL_ZPZ[IZ]_[HSD]")>; // Floating point multiply accumulate -def : InstRW<[N3Write_4c_1V], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$", - "^FN?ML[AS]_ZPZZZ_[HSD]", - "^FML[AS]_ZZZI_[HSD]$")>; +def : 
InstRW<[N3Wr_ZFMA, ReadDefault, N3Rd_ZFMA], + (instregex "^FN?ML[AS]_ZPmZZ_[HSD]", + "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>; +def : InstRW<[N3Wr_ZFMA, N3Rd_ZFMA], + (instregex "^FML[AS]_ZZZI_[HSD]", + "^FN?ML[AS]_ZPZZZ_[HSD]")>; // Floating point multiply add/sub accumulate long -def : InstRW<[N3Write_4c_1V], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>; +def : InstRW<[N3Wr_ZFMAL, N3Rd_ZFMAL], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>; // Floating point reciprocal estimate, F16 def : InstRW<[N3Write_6c_4V0], (instregex "^FR(ECP|SQRT)E_ZZ_H", "^FRECPX_ZPmZ_H")>; @@ -2079,13 +2192,13 @@ def : InstRW<[N3Write_3c_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]$")>; def : InstRW<[N3Write_4c_2V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; // Dot product -def : InstRW<[N3Write_4c_1V], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; +def : InstRW<[N3Wr_ZBFDOT, N3Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; // Matrix multiply accumulate -def : InstRW<[N3Write_5c_1V], (instrs BFMMLA_ZZZ_HtoS)>; +def : InstRW<[N3Wr_ZBFMMA, N3Rd_ZBFMMA], (instrs BFMMLA_ZZZ_HtoS)>; // Multiply accumulate long -def : InstRW<[N3Write_4c_1V], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>; +def : InstRW<[N3Wr_ZBFMAL, N3Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>; // SVE Load instructions // ----------------------------------------------------------------------------- diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s new file mode 100644 index 0000000000000..f6b9db13624b6 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s @@ -0,0 +1,2034 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-n3 -mattr=+sve --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s + +# LLVM-MCA-BEGIN madd +mul x0, x0, x0 +madd x0, x1, x2, x0 +madd x0, x1, x2, x0 +madd x0, x0, x0, x0 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smaddl +mul x0, x0, x0 
+smaddl x0, w1, w2, x0 +smaddl x0, w1, w2, x0 +smaddl x0, w0, w0, x0 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmadd +fadd d0, d0, d0 +fmadd d0, d1, d2, d0 +fmul d0, d0, d0 +fmadd d0, d1, d2, d0 +fmadd d0, d1, d2, d0 +fmadd d0, d0, d1, d2 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN saba +mul v0.4s, v0.4s, v0.4s +saba v0.4s, v1.4s, v2.4s +saba v0.4s, v1.4s, v2.4s +saba v0.4s, v0.4s, v1.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN sdot +mul v0.4s, v0.4s, v0.4s +sdot v0.4s, v1.16b, v2.16b +sdot v0.4s, v1.16b, v2.16b +sdot v0.4s, v0.16b, v1.16b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smmla +mul v0.4s, v0.4s, v0.4s +smmla v0.4s, v1.16b, v2.16b +smmla v0.4s, v1.16b, v2.16b +smmla v0.4s, v0.16b, v1.16b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN mla +mul v0.4s, v0.4s, v0.4s +mla v0.4s, v1.4s, v2.4s +mla v0.4s, v1.4s, v2.4s +mla v0.4s, v0.4s, v1.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN sqrdmlah +mul v0.4s, v0.4s, v0.4s +sqrdmlah v0.8h, v1.8h, v2.8h +sqrdmlah v0.8h, v1.8h, v2.8h +sqrdmlah v0.8h, v1.8h, v2.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smlal2 +mul v0.4s, v0.4s, v0.4s +smlal2 v0.4s, v1.8h, v2.8h +smlal2 v0.4s, v1.8h, v2.8h +smlal2 v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN sqdmlal2 +mul v0.4s, v0.4s, v0.4s +sqdmlal2 v0.4s, v1.8h, v2.8h +sqdmlal2 v0.4s, v1.8h, v2.8h +sqdmlal2 v0.4s, v1.8h, v2.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN sadalp +mul v0.4s, v0.4s, v0.4s +sadalp v0.2d, v1.4s +sadalp v0.2d, v1.4s +sadalp v0.2d, v0.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fcmla +fmul v0.4s, v0.4s, v0.4s +fcmla v0.2d, v1.2d, v2.2d, #90 +fcmla v0.2d, v1.2d, v2.2d, #90 +fcmla v0.2d, v0.2d, v1.2d, #90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmla +fmul v0.2d, v0.2d, v0.2d +fmla v0.2d, v1.2d, v2.2d +fadd v0.2d, v0.2d, v0.2d +fmla v0.2d, v1.2d, v2.2d +fmla v0.2d, v1.2d, v2.2d +fmla v0.2d, v0.2d, v1.2d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmlal +fmul v0.2d, v0.2d, v0.2d +fmlal v0.4s, v1.4h, v2.4h +fadd v0.2d, v0.2d, v0.2d +fmlal v0.4s, v1.4h, v2.4h +fmlal v0.4s, v1.4h, v2.4h +fmlal v0.4s, v0.4h, v1.4h +# 
LLVM-MCA-END + +# LLVM-MCA-BEGIN bfdot +fmul v0.2d, v0.2d, v0.2d +bfdot v0.4s, v1.8h, v2.8h +bfdot v0.4s, v1.8h, v2.8h +bfdot v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmmla +fmul v0.2d, v0.2d, v0.2d +bfmmla v0.4s, v1.8h, v2.8h +bfmmla v0.4s, v1.8h, v2.8h +bfmmla v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmlalb +fmul v0.2d, v0.2d, v0.2d +bfmlalb v0.4s, v1.8h, v2.8h +bfmlalb v0.4s, v1.8h, v2.8h +bfmlalb v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN crc32 +mul w0, w0, w0 +crc32cb w0, w0, w1 +crc32cb w0, w0, w1 +crc32cb w0, w0, w0 +crc32b w0, w0, w15 +crc32h w0, w0, w21 +crc32w w0, w0, w24 +crc32x w0, w0, x25 +crc32ch w0, w0, w16 +crc32cw w0, w0, w23 +crc32cx w0, w0, x5 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z saba +mul z0.d, z0.d, z0.d +saba z0.d, z1.d, z2.d +saba z0.d, z1.d, z2.d +saba z0.d, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sabalt +mul z0.h, z0.h, z0.h +sabalt z0.h, z1.b, z2.b +sabalt z0.h, z1.b, z2.b +sabalt z0.h, z0.b, z1.b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sadalp +mul z0.d, z0.d, z0.d +sadalp z0.d, p0/m, z1.s +sadalp z0.d, p0/m, z1.s +sadalp z0.d, p0/m, z0.s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z ssra +mul z0.d, z0.d, z0.d +ssra z0.d, z1.d, #1 +ssra z0.d, z1.d, #1 +ssra z0.d, z0.d, #1 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z cdot.s +mul z0.d, z0.d, z0.d +cdot z0.s, z1.b, z2.b, #90 +cdot z0.s, z1.b, z2.b, #90 +cdot z0.s, z0.b, z1.b, #90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z cdot.d +mul z0.d, z0.d, z0.d +cdot z0.d, z1.h, z2.h, #90 +cdot z0.d, z1.h, z2.h, #90 +cdot z0.d, z0.h, z1.h, #90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z cmla.b +mul z0.d, z0.d, z0.d +cmla z0.b, z1.b, z2.b, #90 +cmla z0.b, z1.b, z2.b, #90 +cmla z0.b, z0.b, z1.b, #90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z cmla.d +mul z0.d, z0.d, z0.d +cmla z0.d, z1.d, z2.d, #90 +cmla z0.d, z1.d, z2.d, #90 +cmla z0.d, z0.d, z1.d, #90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sdot.s +mul z0.d, p0/m, z0.d, z0.d +sdot z0.s, z1.b, z2.b +sdot z0.s, z1.b, z2.b +sdot z0.s, 
z0.b, z1.b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sudot +mul z0.d, p0/m, z0.d, z0.d +sdot z0.s, z1.b, z2.b[1] +sdot z0.s, z1.b, z2.b[1] +sdot z0.s, z0.b, z1.b[1] +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sdot.d +mul z0.d, z0.d, z0.d +sdot z0.d, z1.h, z2.h +sdot z0.d, z1.h, z2.h +sdot z0.d, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z smmla +mul z0.s, z0.s, z0.s +smmla z0.s, z1.b, z2.b +smmla z0.s, z1.b, z2.b +smmla z0.s, z0.b, z1.b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z mla.b +mul z0.d, z0.d, z0.d +mla z0.b, p0/m, z1.b, z2.b +mla z0.b, p0/m, z1.b, z2.b +mla z0.b, p0/m, z0.b, z1.b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z mla.d +mul z0.d, z0.d, z0.d +mla z0.d, p0/m, z1.d, z2.d +mla z0.d, p0/m, z1.d, z2.d +mla z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z smlalb +mul z0.d, z0.d, z0.d +smlalb z0.d, z1.s, z2.s +smlalb z0.d, z1.s, z2.s +smlalb z0.d, z0.s, z1.s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sqdmlalb +mul z0.d, z0.d, z0.d +sqdmlalb z0.d, z1.s, z2.s +sqdmlalb z0.d, z1.s, z2.s +sqdmlalb z0.d, z0.s, z1.s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sqrdmlah.b +mul z0.d, z0.d, z0.d +sqrdmlah z0.b, z1.b, z2.b +sqrdmlah z0.b, z1.b, z2.b +sqrdmlah z0.b, z0.b, z1.b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sqrdmlah.d +mul z0.d, z0.d, z0.d +sqrdmlah z0.d, z1.d, z2.d +sqrdmlah z0.d, z1.d, z2.d +sqrdmlah z0.d, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fcmla ZPmZZ +fmul z0.d, z0.d, z0.d +fcmla z0.d, p0/m, z1.d, z2.d, 90 +fcmla z0.d, p0/m, z1.d, z2.d, 90 +fcmla z0.d, p0/m, z0.d, z1.d, 90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fcmla ZZZI +fmul z0.d, z0.d, z0.d +fcmla z0.s, z1.s, z2.s[1], 90 +fcmla z0.s, z1.s, z2.s[1], 90 +fcmla z0.s, z0.s, z1.s[1], 90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fmla ZPmZZ +fmul z0.d, z0.d, z0.d +fmla z0.d, p0/m, z1.d, z2.d +fmla z0.d, p0/m, z1.d, z2.d +fmla z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fmla ZZZI +fmul z0.d, z0.d, z0.d +fmla z0.d, z1.d, z2.d[1] +fmla z0.d, z1.d, z2.d[1] +fmla z0.d, z0.d, z1.d[1] +# LLVM-MCA-END + +# 
LLVM-MCA-BEGIN Z fmlalb ZZZ +fmul z0.d, z0.d, z0.d +fmlalb z0.s, z1.h, z2.h +fmlalb z0.s, z1.h, z2.h +fmlalb z0.s, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z bfdot +fmul z0.d, z0.d, z0.d +bfdot z0.s, z1.h, z2.h +bfdot z0.s, z1.h, z2.h +bfdot z0.s, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z bfmmla +fmul z0.d, z0.d, z0.d +bfmmla z0.s, z1.h, z2.h +bfmmla z0.s, z1.h, z2.h +bfmmla z0.s, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmlalb +fmul z0.d, z0.d, z0.d +bfmlalb z0.s, z1.h, z2.h +bfmlalb z0.s, z1.h, z2.h +bfmlalb z0.s, z0.h, z1.h +# LLVM-MCA-END + +# CHECK: [0] Code Region - madd + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.57 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . .. mul x0, x0, x0 +# CHECK-NEXT: [0,1] D==eeER . .. madd x0, x1, x2, x0 +# CHECK-NEXT: [0,2] D===eeER . .. madd x0, x1, x2, x0 +# CHECK-NEXT: [0,3] D=====eeER. .. madd x0, x0, x0, x0 +# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0 +# CHECK-NEXT: [1,1] .D========eeER .. madd x0, x1, x2, x0 +# CHECK-NEXT: [1,2] .D=========eeER.. madd x0, x1, x2, x0 +# CHECK-NEXT: [1,3] .D===========eeER madd x0, x0, x0, x0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0 +# CHECK-NEXT: 1. 2 6.0 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 2. 2 7.0 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 3. 
2 9.0 0.0 0.0 madd x0, x0, x0, x0 +# CHECK-NEXT: 2 6.6 0.1 0.0 + +# CHECK: [1] Code Region - smaddl + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.57 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . .. mul x0, x0, x0 +# CHECK-NEXT: [0,1] D==eeER . .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [0,2] D===eeER . .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [0,3] D=====eeER. .. smaddl x0, w0, w0, x0 +# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0 +# CHECK-NEXT: [1,1] .D========eeER .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [1,2] .D=========eeER.. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [1,3] .D===========eeER smaddl x0, w0, w0, x0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0 +# CHECK-NEXT: 1. 2 6.0 0.0 0.0 smaddl x0, w1, w2, x0 +# CHECK-NEXT: 2. 2 7.0 0.0 0.0 smaddl x0, w1, w2, x0 +# CHECK-NEXT: 3. 2 9.0 0.0 0.0 smaddl x0, w0, w0, x0 +# CHECK-NEXT: 2 6.6 0.1 0.0 + +# CHECK: [2] Code Region - fmadd + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.35 +# CHECK-NEXT: IPC: 0.35 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeER. . . . . . .. fadd d0, d0, d0 +# CHECK-NEXT: [0,1] D==eeeeER . . . . . .. 
fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,2] D======eeeER . . . . .. fmul d0, d0, d0 +# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,5] .D============eeeeER. . . .. fmadd d0, d0, d1, d2 +# CHECK-NEXT: [1,0] .D================eeER . . .. fadd d0, d0, d0 +# CHECK-NEXT: [1,1] .D==================eeeeER . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,2] .D======================eeeER . .. fmul d0, d0, d0 +# CHECK-NEXT: [1,3] .D=======================eeeeER .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,4] . D========================eeeeER .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,5] . D============================eeeeER fmadd d0, d0, d1, d2 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fadd d0, d0, d0 +# CHECK-NEXT: 1. 2 11.0 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 fmul d0, d0, d0 +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 4. 2 17.5 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 5. 2 21.0 0.0 0.0 fmadd d0, d0, d1, d2 +# CHECK-NEXT: 2 14.9 0.1 0.0 + +# CHECK: [3] Code Region - saba + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . 
saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . saba v0.4s, v0.4s, v1.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] .D================eeeeER . . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,2] .D=================eeeeER. . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,3] .D=====================eeeeER saba v0.4s, v0.4s, v1.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 saba v0.4s, v0.4s, v1.4s +# CHECK-NEXT: 2 11.6 0.1 0.0 + +# CHECK: [4] Code Region - sdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1103 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.36 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeeeeER . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeER. . . . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,2] D=====eeeER . . . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,3] D========eeeER . . . sdot v0.4s, v0.16b, v1.16b +# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] .D==============eeeER . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,2] .D===============eeeER . 
sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,3] .D==================eeeER sdot v0.4s, v0.16b, v1.16b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 2. 2 11.0 0.0 0.0 sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 3. 2 14.0 0.0 0.0 sdot v0.4s, v0.16b, v1.16b +# CHECK-NEXT: 2 10.4 0.1 0.0 + +# CHECK: [5] Code Region - smmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1103 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.36 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeeeeER . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeER. . . . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,2] D=====eeeER . . . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,3] D========eeeER . . . smmla v0.4s, v0.16b, v1.16b +# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] .D==============eeeER . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,2] .D===============eeeER . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,3] .D==================eeeER smmla v0.4s, v0.16b, v1.16b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 
2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 2. 2 11.0 0.0 0.0 smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 3. 2 14.0 0.0 0.0 smmla v0.4s, v0.16b, v1.16b +# CHECK-NEXT: 2 10.4 0.1 0.0 + +# CHECK: [6] Code Region - mla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 4.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . mla v0.4s, v0.4s, v1.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] .D================eeeeER . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,2] .D=================eeeeER. . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,3] .D=====================eeeeER mla v0.4s, v0.4s, v1.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.0 0.0 0.0 mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 3. 
2 16.0 0.0 0.0 mla v0.4s, v0.4s, v1.4s +# CHECK-NEXT: 2 11.6 0.1 0.0 + +# CHECK: [7] Code Region - sqrdmlah + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.33 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 4.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeER . . . .. mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . .. sqrdmlah v0.8h, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D======eeeeER . . .. sqrdmlah v0.8h, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D========eeeeER. . .. sqrdmlah v0.8h, v1.8h, v2.8h +# CHECK-NEXT: [1,0] D============eeeeER . .. mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] .D===============eeeeER .. sqrdmlah v0.8h, v1.8h, v2.8h +# CHECK-NEXT: [1,2] .D=================eeeeER.. sqrdmlah v0.8h, v1.8h, v2.8h +# CHECK-NEXT: [1,3] .D===================eeeeER sqrdmlah v0.8h, v1.8h, v2.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 sqrdmlah v0.8h, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sqrdmlah v0.8h, v1.8h, v2.8h +# CHECK-NEXT: 3. 
2 14.5 0.0 0.0 sqrdmlah v0.8h, v1.8h, v2.8h +# CHECK-NEXT: 2 11.1 0.1 0.0 + +# CHECK: [8] Code Region - smlal2 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 4.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . smlal2 v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] .D================eeeeER . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] .D=================eeeeER. . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] .D=====================eeeeER smlal2 v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 
2 16.0 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 11.6 0.1 0.0 + +# CHECK: [9] Code Region - sqdmlal2 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.33 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 4.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeER . . . .. mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . .. sqdmlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D======eeeeER . . .. sqdmlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D========eeeeER. . .. sqdmlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,0] D============eeeeER . .. mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] .D===============eeeeER .. sqdmlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] .D=================eeeeER.. sqdmlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] .D===================eeeeER sqdmlal2 v0.4s, v1.8h, v2.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 sqdmlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sqdmlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 
2 14.5 0.0 0.0 sqdmlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2 11.1 0.1 0.0 + +# CHECK: [10] Code Region - sadalp + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . sadalp v0.2d, v0.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] .D================eeeeER . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [1,2] .D=================eeeeER. . sadalp v0.2d, v1.4s +# CHECK-NEXT: [1,3] .D=====================eeeeER sadalp v0.2d, v0.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.0 0.0 0.0 sadalp v0.2d, v1.4s +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 sadalp v0.2d, v1.4s +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sadalp v0.2d, v0.4s +# CHECK-NEXT: 2 11.6 0.1 0.0 + +# CHECK: [11] Code Region - fcmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . 
fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [0,3] D=========eeeeER . . . fcmla v0.2d, v0.2d, v1.2d, #90 +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] .D===============eeeeER . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [1,2] .D=================eeeeER. . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [1,3] .D=====================eeeeER fcmla v0.2d, v0.2d, v1.2d, #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fcmla v0.2d, v0.2d, v1.2d, #90 +# CHECK-NEXT: 2 11.4 0.1 0.0 + +# CHECK: [12] Code Region - fmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.35 +# CHECK-NEXT: IPC: 0.35 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D=eeeeER . . . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,2] D=====eeER. . . . . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,5] .D============eeeeER. . . .. 
fmla v0.2d, v0.2d, v1.2d +# CHECK-NEXT: [1,0] .D================eeeER . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] .D=================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,2] .D=====================eeER . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,3] .D=======================eeeeER .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,4] . D========================eeeeER .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,5] . D============================eeeeER fmla v0.2d, v0.2d, v1.2d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 2. 2 14.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 4. 2 17.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 5. 2 21.0 0.0 0.0 fmla v0.2d, v0.2d, v1.2d +# CHECK-NEXT: 2 14.6 0.1 0.0 + +# CHECK: [13] Code Region - fmlal + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 1903 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.32 +# CHECK-NEXT: IPC: 0.32 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456789 +# CHECK-NEXT: Index 0123456789 0123456789 0 + +# CHECK: [0,0] DeeeER . . . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . . . fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,2] D=======eeER . . . . . . fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,3] D=========eeeeER . . . . . fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,4] D===========eeeeER . . . . . 
fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,5] .D==============eeeeER . . . . fmlal v0.4s, v0.4h, v1.4h +# CHECK-NEXT: [1,0] .D==================eeeER. . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] .D=====================eeeeER . . . fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,2] .D=========================eeER . . fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,3] .D===========================eeeeER. . fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,4] . D============================eeeeER . fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,5] . D================================eeeeER fmlal v0.4s, v0.4h, v1.4h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 10.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 13.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 2. 2 17.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 3. 2 19.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 4. 2 20.5 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 5. 2 24.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h +# CHECK-NEXT: 2 17.3 0.1 0.0 + +# CHECK: [14] Code Region - bfdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . 
bfdot v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] .D===============eeeeER . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] .D=================eeeeER. . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] .D=====================eeeeER bfdot v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 bfdot v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 11.4 0.1 0.0 + +# CHECK: [15] Code Region - bfmmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1603 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.25 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 01234 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D======eeeeeER . . . . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfmmla v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D================eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] .D==================eeeeeER . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] .D=====================eeeeeER. . 
bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] .D==========================eeeeeER bfmmla v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 14.5 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 2 19.5 0.0 0.0 bfmmla v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 13.6 0.1 0.0 + +# CHECK: [16] Code Region - bfmlalb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfmlalb v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] .D===============eeeeER . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] .D=================eeeeER. . 
bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] .D=====================eeeeER bfmlalb v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 bfmlalb v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 11.4 0.1 0.0 + +# CHECK: [17] Code Region - crc32 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 1100 +# CHECK-NEXT: Total Cycles: 1403 +# CHECK-NEXT: Total uOps: 1100 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.78 +# CHECK-NEXT: IPC: 0.78 +# CHECK-NEXT: Block RThroughput: 10.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeER. . . . . . mul w0, w0, w0 +# CHECK-NEXT: [0,1] D==eeER . . . . . crc32cb w0, w0, w1 +# CHECK-NEXT: [0,2] D===eeER . . . . . crc32cb w0, w0, w1 +# CHECK-NEXT: [0,3] D=====eeER. . . . . crc32cb w0, w0, w0 +# CHECK-NEXT: [0,4] D======eeER . . . . crc32b w0, w0, w15 +# CHECK-NEXT: [0,5] .D======eeER . . . . crc32h w0, w0, w21 +# CHECK-NEXT: [0,6] .D=======eeER . . . . crc32w w0, w0, w24 +# CHECK-NEXT: [0,7] .D========eeER . . . . crc32x w0, w0, x25 +# CHECK-NEXT: [0,8] .D=========eeER. . . . crc32ch w0, w0, w16 +# CHECK-NEXT: [0,9] .D==========eeER . . . crc32cw w0, w0, w23 +# CHECK-NEXT: [0,10] . D==========eeER . . . crc32cx w0, w0, x5 +# CHECK-NEXT: [1,0] . D============eeER . . . mul w0, w0, w0 +# CHECK-NEXT: [1,1] . D==============eeER . . crc32cb w0, w0, w1 +# CHECK-NEXT: [1,2] . D===============eeER . . crc32cb w0, w0, w1 +# CHECK-NEXT: [1,3] . 
D=================eeER . . crc32cb w0, w0, w0 +# CHECK-NEXT: [1,4] . D=================eeER. . crc32b w0, w0, w15 +# CHECK-NEXT: [1,5] . D==================eeER . crc32h w0, w0, w21 +# CHECK-NEXT: [1,6] . D===================eeER . crc32w w0, w0, w24 +# CHECK-NEXT: [1,7] . D====================eeER . crc32x w0, w0, x25 +# CHECK-NEXT: [1,8] . D=====================eeER . crc32ch w0, w0, w16 +# CHECK-NEXT: [1,9] . D=====================eeER. crc32cw w0, w0, w23 +# CHECK-NEXT: [1,10] . D======================eeER crc32cx w0, w0, x5 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul w0, w0, w0 +# CHECK-NEXT: 1. 2 9.0 0.0 0.0 crc32cb w0, w0, w1 +# CHECK-NEXT: 2. 2 10.0 0.0 0.0 crc32cb w0, w0, w1 +# CHECK-NEXT: 3. 2 12.0 0.0 0.0 crc32cb w0, w0, w0 +# CHECK-NEXT: 4. 2 12.5 0.0 0.0 crc32b w0, w0, w15 +# CHECK-NEXT: 5. 2 13.0 0.0 0.0 crc32h w0, w0, w21 +# CHECK-NEXT: 6. 2 14.0 0.0 0.0 crc32w w0, w0, w24 +# CHECK-NEXT: 7. 2 15.0 0.0 0.0 crc32x w0, w0, x25 +# CHECK-NEXT: 8. 2 16.0 0.0 0.0 crc32ch w0, w0, w16 +# CHECK-NEXT: 9. 2 16.5 0.0 0.0 crc32cw w0, w0, w23 +# CHECK-NEXT: 10. 2 17.0 0.0 0.0 crc32cx w0, w0, x5 +# CHECK-NEXT: 2 12.9 0.0 0.0 + +# CHECK: [18] Code Region - Z saba + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1403 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeER . . . . 
saba z0.d, z1.d, z2.d +# CHECK-NEXT: [0,2] D======eeeeER . . . . saba z0.d, z1.d, z2.d +# CHECK-NEXT: [0,3] D==========eeeeER . . . saba z0.d, z0.d, z1.d +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . saba z0.d, z1.d, z2.d +# CHECK-NEXT: [1,2] .D===================eeeeER . saba z0.d, z1.d, z2.d +# CHECK-NEXT: [1,3] .D=======================eeeeER saba z0.d, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 saba z0.d, z1.d, z2.d +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 saba z0.d, z1.d, z2.d +# CHECK-NEXT: 3. 2 17.5 0.0 0.0 saba z0.d, z0.d, z1.d +# CHECK-NEXT: 2 12.8 0.1 0.0 + +# CHECK: [19] Code Region - Z sabalt + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul z0.h, z0.h, z0.h +# CHECK-NEXT: [0,1] D====eeeeER . . . . sabalt z0.h, z1.b, z2.b +# CHECK-NEXT: [0,2] D=====eeeeER . . . . sabalt z0.h, z1.b, z2.b +# CHECK-NEXT: [0,3] D=========eeeeER . . . sabalt z0.h, z0.b, z1.b +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul z0.h, z0.h, z0.h +# CHECK-NEXT: [1,1] .D================eeeeER . . sabalt z0.h, z1.b, z2.b +# CHECK-NEXT: [1,2] .D=================eeeeER. . 
sabalt z0.h, z1.b, z2.b +# CHECK-NEXT: [1,3] .D=====================eeeeER sabalt z0.h, z0.b, z1.b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.h, z0.h, z0.h +# CHECK-NEXT: 1. 2 11.0 0.0 0.0 sabalt z0.h, z1.b, z2.b +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 sabalt z0.h, z1.b, z2.b +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sabalt z0.h, z0.b, z1.b +# CHECK-NEXT: 2 11.6 0.1 0.0 + +# CHECK: [20] Code Region - Z sadalp + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1403 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeER . . . . sadalp z0.d, p0/m, z1.s +# CHECK-NEXT: [0,2] D======eeeeER . . . . sadalp z0.d, p0/m, z1.s +# CHECK-NEXT: [0,3] D==========eeeeER . . . sadalp z0.d, p0/m, z0.s +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . sadalp z0.d, p0/m, z1.s +# CHECK-NEXT: [1,2] .D===================eeeeER . sadalp z0.d, p0/m, z1.s +# CHECK-NEXT: [1,3] .D=======================eeeeER sadalp z0.d, p0/m, z0.s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 
2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 sadalp z0.d, p0/m, z1.s +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 sadalp z0.d, p0/m, z1.s +# CHECK-NEXT: 3. 2 17.5 0.0 0.0 sadalp z0.d, p0/m, z0.s +# CHECK-NEXT: 2 12.8 0.1 0.0 + +# CHECK: [21] Code Region - Z ssra + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1403 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeER . . . . ssra z0.d, z1.d, #1 +# CHECK-NEXT: [0,2] D======eeeeER . . . . ssra z0.d, z1.d, #1 +# CHECK-NEXT: [0,3] D==========eeeeER . . . ssra z0.d, z0.d, #1 +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . ssra z0.d, z1.d, #1 +# CHECK-NEXT: [1,2] .D===================eeeeER . ssra z0.d, z1.d, #1 +# CHECK-NEXT: [1,3] .D=======================eeeeER ssra z0.d, z0.d, #1 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 ssra z0.d, z1.d, #1 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 ssra z0.d, z1.d, #1 +# CHECK-NEXT: 3. 
2 17.5 0.0 0.0 ssra z0.d, z0.d, #1 +# CHECK-NEXT: 2 12.8 0.1 0.0 + +# CHECK: [22] Code Region - Z cdot.s + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. cdot z0.s, z1.b, z2.b, #90 +# CHECK-NEXT: [0,2] D======eeeER . . .. cdot z0.s, z1.b, z2.b, #90 +# CHECK-NEXT: [0,3] D=========eeeER. . .. cdot z0.s, z0.b, z1.b, #90 +# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D================eeeER .. cdot z0.s, z1.b, z2.b, #90 +# CHECK-NEXT: [1,2] .D=================eeeER .. cdot z0.s, z1.b, z2.b, #90 +# CHECK-NEXT: [1,3] .D====================eeeER cdot z0.s, z0.b, z1.b, #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 cdot z0.s, z1.b, z2.b, #90 +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 cdot z0.s, z1.b, z2.b, #90 +# CHECK-NEXT: 3. 
2 15.5 0.0 0.0 cdot z0.s, z0.b, z1.b, #90 +# CHECK-NEXT: 2 11.5 0.1 0.0 + +# CHECK: [23] Code Region - Z cdot.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1403 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: Block RThroughput: 5.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeER . . . . cdot z0.d, z1.h, z2.h, #90 +# CHECK-NEXT: [0,2] D======eeeeER . . . . cdot z0.d, z1.h, z2.h, #90 +# CHECK-NEXT: [0,3] D==========eeeeER . . . cdot z0.d, z0.h, z1.h, #90 +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . cdot z0.d, z1.h, z2.h, #90 +# CHECK-NEXT: [1,2] .D===================eeeeER . cdot z0.d, z1.h, z2.h, #90 +# CHECK-NEXT: [1,3] .D=======================eeeeER cdot z0.d, z0.h, z1.h, #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 cdot z0.d, z1.h, z2.h, #90 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 cdot z0.d, z1.h, z2.h, #90 +# CHECK-NEXT: 3. 
2 17.5 0.0 0.0 cdot z0.d, z0.h, z1.h, #90 +# CHECK-NEXT: 2 12.8 0.1 0.0 + +# CHECK: [24] Code Region - Z cmla.b + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1403 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: Block RThroughput: 5.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeER . . . . cmla z0.b, z1.b, z2.b, #90 +# CHECK-NEXT: [0,2] D======eeeeER . . . . cmla z0.b, z1.b, z2.b, #90 +# CHECK-NEXT: [0,3] D==========eeeeER . . . cmla z0.b, z0.b, z1.b, #90 +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . cmla z0.b, z1.b, z2.b, #90 +# CHECK-NEXT: [1,2] .D===================eeeeER . cmla z0.b, z1.b, z2.b, #90 +# CHECK-NEXT: [1,3] .D=======================eeeeER cmla z0.b, z0.b, z1.b, #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 cmla z0.b, z1.b, z2.b, #90 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 cmla z0.b, z1.b, z2.b, #90 +# CHECK-NEXT: 3. 
2 17.5 0.0 0.0 cmla z0.b, z0.b, z1.b, #90 +# CHECK-NEXT: 2 12.8 0.1 0.0 + +# CHECK: [25] Code Region - Z cmla.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1803 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.28 +# CHECK-NEXT: IPC: 0.22 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012345678 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . cmla z0.d, z1.d, z2.d, #90 +# CHECK-NEXT: [0,2] D========eeeeeER . . . . . cmla z0.d, z1.d, z2.d, #90 +# CHECK-NEXT: [0,3] D=============eeeeeER . . . . cmla z0.d, z0.d, z1.d, #90 +# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D======================eeeeeER . . cmla z0.d, z1.d, z2.d, #90 +# CHECK-NEXT: [1,2] .D=========================eeeeeER . . cmla z0.d, z1.d, z2.d, #90 +# CHECK-NEXT: [1,3] .D==============================eeeeeER cmla z0.d, z0.d, z1.d, #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 cmla z0.d, z1.d, z2.d, #90 +# CHECK-NEXT: 2. 2 17.5 0.0 0.0 cmla z0.d, z1.d, z2.d, #90 +# CHECK-NEXT: 3. 
2 22.5 0.0 0.0 cmla z0.d, z0.d, z1.d, #90 +# CHECK-NEXT: 2 16.0 0.1 0.0 + +# CHECK: [26] Code Region - Z sdot.s + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b +# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] .D================eeeER .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [1,2] .D=================eeeER .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [1,3] .D====================eeeER sdot z0.s, z0.b, z1.b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sdot z0.s, z1.b, z2.b +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sdot z0.s, z1.b, z2.b +# CHECK-NEXT: 3. 
2 15.5 0.0 0.0 sdot z0.s, z0.b, z1.b +# CHECK-NEXT: 2 11.5 0.1 0.0 + +# CHECK: [27] Code Region - Z sudot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b[1] +# CHECK-NEXT: [1,0] .D===========eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] .D================eeeER .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [1,2] .D=================eeeER .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [1,3] .D====================eeeER sdot z0.s, z0.b, z1.b[1] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: 3. 
2 15.5 0.0 0.0 sdot z0.s, z0.b, z1.b[1] +# CHECK-NEXT: 2 11.5 0.1 0.0 + +# CHECK: [28] Code Region - Z sdot.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1403 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: Block RThroughput: 5.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeER . . . . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [0,2] D======eeeeER . . . . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [0,3] D==========eeeeER . . . sdot z0.d, z0.h, z1.h +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [1,2] .D===================eeeeER . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [1,3] .D=======================eeeeER sdot z0.d, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 sdot z0.d, z1.h, z2.h +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 sdot z0.d, z1.h, z2.h +# CHECK-NEXT: 3. 
2 17.5 0.0 0.0 sdot z0.d, z0.h, z1.h +# CHECK-NEXT: 2 12.8 0.1 0.0 + +# CHECK: [29] Code Region - Z smmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1103 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.36 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeeeeER . . . . mul z0.s, z0.s, z0.s +# CHECK-NEXT: [0,1] D====eeeER. . . . smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [0,2] D=====eeeER . . . smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [0,3] D========eeeER . . . smmla z0.s, z0.b, z1.b +# CHECK-NEXT: [1,0] D===========eeeeER . . mul z0.s, z0.s, z0.s +# CHECK-NEXT: [1,1] .D==============eeeER . smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [1,2] .D===============eeeER . smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [1,3] .D==================eeeER smmla z0.s, z0.b, z1.b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul z0.s, z0.s, z0.s +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 smmla z0.s, z1.b, z2.b +# CHECK-NEXT: 2. 2 11.0 0.0 0.0 smmla z0.s, z1.b, z2.b +# CHECK-NEXT: 3. 2 14.0 0.0 0.0 smmla z0.s, z0.b, z1.b +# CHECK-NEXT: 2 10.4 0.1 0.0 + +# CHECK: [30] Code Region - Z mla.b + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1403 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: Block RThroughput: 5.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . 
mul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeER . . . . mla z0.b, p0/m, z1.b, z2.b +# CHECK-NEXT: [0,2] D======eeeeER . . . . mla z0.b, p0/m, z1.b, z2.b +# CHECK-NEXT: [0,3] D==========eeeeER . . . mla z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . mla z0.b, p0/m, z1.b, z2.b +# CHECK-NEXT: [1,2] .D===================eeeeER . mla z0.b, p0/m, z1.b, z2.b +# CHECK-NEXT: [1,3] .D=======================eeeeER mla z0.b, p0/m, z0.b, z1.b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 mla z0.b, p0/m, z1.b, z2.b +# CHECK-NEXT: 3. 2 17.5 0.0 0.0 mla z0.b, p0/m, z0.b, z1.b +# CHECK-NEXT: 2 12.8 0.1 0.0 + +# CHECK: [31] Code Region - Z mla.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1803 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.28 +# CHECK-NEXT: IPC: 0.22 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012345678 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D========eeeeeER . . . . . mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D=============eeeeeER . . . . mla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D======================eeeeeER . . 
mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] .D=========================eeeeeER . . mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D==============================eeeeeER mla z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 17.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 2 22.5 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 16.0 0.1 0.0 + +# CHECK: [32] Code Region - Z smlalb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1403 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: Block RThroughput: 5.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeER . . . . smlalb z0.d, z1.s, z2.s +# CHECK-NEXT: [0,2] D======eeeeER . . . . smlalb z0.d, z1.s, z2.s +# CHECK-NEXT: [0,3] D==========eeeeER . . . smlalb z0.d, z0.s, z1.s +# CHECK-NEXT: [1,0] .D=============eeeeeER . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeER . smlalb z0.d, z1.s, z2.s +# CHECK-NEXT: [1,2] .D===================eeeeER . 
smlalb z0.d, z1.s, z2.s +# CHECK-NEXT: [1,3] .D=======================eeeeER smlalb z0.d, z0.s, z1.s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.5 0.0 0.0 smlalb z0.d, z1.s, z2.s +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 smlalb z0.d, z1.s, z2.s +# CHECK-NEXT: 3. 2 17.5 0.0 0.0 smlalb z0.d, z0.s, z1.s +# CHECK-NEXT: 2 12.8 0.1 0.0 + +# CHECK: [33] Code Region - Z sqdmlalb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.33 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 5.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeER . . . . . sqdmlalb z0.d, z1.s, z2.s +# CHECK-NEXT: [0,2] D=======eeeeER . . . . . sqdmlalb z0.d, z1.s, z2.s +# CHECK-NEXT: [0,3] D===========eeeeER . . . . sqdmlalb z0.d, z0.s, z1.s +# CHECK-NEXT: [1,0] .D==============eeeeeER . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D===================eeeeER . . sqdmlalb z0.d, z1.s, z2.s +# CHECK-NEXT: [1,2] .D=====================eeeeER . . 
sqdmlalb z0.d, z1.s, z2.s +# CHECK-NEXT: [1,3] .D=========================eeeeER sqdmlalb z0.d, z0.s, z1.s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 sqdmlalb z0.d, z1.s, z2.s +# CHECK-NEXT: 3. 2 19.0 0.0 0.0 sqdmlalb z0.d, z0.s, z1.s +# CHECK-NEXT: 2 13.8 0.1 0.0 + +# CHECK: [34] Code Region - Z sqrdmlah.b + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.33 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 5.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeER . . . . . sqrdmlah z0.b, z1.b, z2.b +# CHECK-NEXT: [0,2] D=======eeeeER . . . . . sqrdmlah z0.b, z1.b, z2.b +# CHECK-NEXT: [0,3] D===========eeeeER . . . . sqrdmlah z0.b, z0.b, z1.b +# CHECK-NEXT: [1,0] .D==============eeeeeER . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D===================eeeeER . . sqrdmlah z0.b, z1.b, z2.b +# CHECK-NEXT: [1,2] .D=====================eeeeER . . 
sqrdmlah z0.b, z1.b, z2.b +# CHECK-NEXT: [1,3] .D=========================eeeeER sqrdmlah z0.b, z0.b, z1.b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 sqrdmlah z0.b, z1.b, z2.b +# CHECK-NEXT: 3. 2 19.0 0.0 0.0 sqrdmlah z0.b, z0.b, z1.b +# CHECK-NEXT: 2 13.8 0.1 0.0 + +# CHECK: [35] Code Region - Z sqrdmlah.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1803 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.28 +# CHECK-NEXT: IPC: 0.22 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012345678 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . . . sqrdmlah z0.d, z1.d, z2.d +# CHECK-NEXT: [0,2] D========eeeeeER . . . . . sqrdmlah z0.d, z1.d, z2.d +# CHECK-NEXT: [0,3] D=============eeeeeER . . . . sqrdmlah z0.d, z0.d, z1.d +# CHECK-NEXT: [1,0] .D=================eeeeeER . . . mul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D======================eeeeeER . . sqrdmlah z0.d, z1.d, z2.d +# CHECK-NEXT: [1,2] .D=========================eeeeeER . . 
sqrdmlah z0.d, z1.d, z2.d +# CHECK-NEXT: [1,3] .D==============================eeeeeER sqrdmlah z0.d, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d +# CHECK-NEXT: 2. 2 17.5 0.0 0.0 sqrdmlah z0.d, z1.d, z2.d +# CHECK-NEXT: 3. 2 22.5 0.0 0.0 sqrdmlah z0.d, z0.d, z1.d +# CHECK-NEXT: 2 16.0 0.1 0.0 + +# CHECK: [36] Code Region - Z fcmla ZPmZZ + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [0,3] D=========eeeeER . . . fcmla z0.d, p0/m, z0.d, z1.d, #90 +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D===============eeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [1,2] .D=================eeeeER. . 
fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [1,3] .D=====================eeeeER fcmla z0.d, p0/m, z0.d, z1.d, #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fcmla z0.d, p0/m, z0.d, z1.d, #90 +# CHECK-NEXT: 2 11.4 0.1 0.0 + +# CHECK: [37] Code Region - Z fcmla ZZZI + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [0,3] D=========eeeeER . . . fcmla z0.s, z0.s, z1.s[1], #90 +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D===============eeeeER . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [1,2] .D=================eeeeER. . 
fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [1,3] .D=====================eeeeER fcmla z0.s, z0.s, z1.s[1], #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fcmla z0.s, z0.s, z1.s[1], #90 +# CHECK-NEXT: 2 11.4 0.1 0.0 + +# CHECK: [38] Code Region - Z fmla ZPmZZ + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D===============eeeeER . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] .D=================eeeeER. . 
fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D=====================eeeeER fmla z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 11.4 0.1 0.0 + +# CHECK: [39] Code Region - Z fmla ZZZI + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, z0.d, z1.d[1] +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D===============eeeeER . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [1,2] .D=================eeeeER. . 
fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [1,3] .D=====================eeeeER fmla z0.d, z0.d, z1.d[1] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmla z0.d, z0.d, z1.d[1] +# CHECK-NEXT: 2 11.4 0.1 0.0 + +# CHECK: [40] Code Region - Z fmlalb ZZZ + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D=========eeeeER . . . fmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D===============eeeeER . . fmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] .D=================eeeeER. . fmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] .D=====================eeeeER fmlalb z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 
2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 fmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 fmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 fmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: 2 11.4 0.1 0.0 + +# CHECK: [41] Code Region - Z bfdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfdot z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D===============eeeeER . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] .D=================eeeeER. . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] .D=====================eeeeER bfdot z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 
2 16.0 0.0 0.0 bfdot z0.s, z0.h, z1.h +# CHECK-NEXT: 2 11.4 0.1 0.0 + +# CHECK: [42] Code Region - Z bfmmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1603 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.25 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 01234 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D======eeeeeER . . . . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfmmla z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D================eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D==================eeeeeER . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] .D=====================eeeeeER. . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] .D==========================eeeeeER bfmmla z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 14.5 0.0 0.0 bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 
2 19.5 0.0 0.0 bfmmla z0.s, z0.h, z1.h +# CHECK-NEXT: 2 13.6 0.1 0.0 + +# CHECK: [43] Code Region - bfmlalb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 5 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] .D===============eeeeER . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] .D=================eeeeER. . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] .D=====================eeeeER bfmlalb z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.0 0.0 0.0 bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 12.0 0.0 0.0 bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 
2 16.0 0.0 0.0 bfmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: 2 11.4 0.1 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-sve-instructions.s index aa565f9aded26..e7160e02c7c7f 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-sve-instructions.s @@ -3685,21 +3685,21 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 2 0.50 clz z31.h, p7/m, z31.h # CHECK-NEXT: 1 2 0.50 clz z31.s, p7/m, z31.s # CHECK-NEXT: 1 4 1.00 cmla z0.b, z1.b, z2.b, #0 -# CHECK-NEXT: 2 5 2.00 cmla z0.d, z1.d, z2.d, #0 +# CHECK-NEXT: 1 5 2.00 cmla z0.d, z1.d, z2.d, #0 # CHECK-NEXT: 1 4 1.00 cmla z0.h, z1.h, z2.h, #0 # CHECK-NEXT: 1 4 1.00 cmla z0.h, z1.h, z2.h[0], #0 # CHECK-NEXT: 1 4 1.00 cmla z0.s, z1.s, z2.s, #0 # CHECK-NEXT: 1 4 1.00 cmla z0.s, z1.s, z2.s[0], #0 # CHECK-NEXT: 1 4 1.00 cmla z15.b, z16.b, z17.b, #270 -# CHECK-NEXT: 2 5 2.00 cmla z15.d, z16.d, z17.d, #270 +# CHECK-NEXT: 1 5 2.00 cmla z15.d, z16.d, z17.d, #270 # CHECK-NEXT: 1 4 1.00 cmla z15.h, z16.h, z17.h, #270 # CHECK-NEXT: 1 4 1.00 cmla z15.s, z16.s, z17.s, #270 # CHECK-NEXT: 1 4 1.00 cmla z29.b, z30.b, z31.b, #90 -# CHECK-NEXT: 2 5 2.00 cmla z29.d, z30.d, z31.d, #90 +# CHECK-NEXT: 1 5 2.00 cmla z29.d, z30.d, z31.d, #90 # CHECK-NEXT: 1 4 1.00 cmla z29.h, z30.h, z31.h, #90 # CHECK-NEXT: 1 4 1.00 cmla z29.s, z30.s, z31.s, #90 # CHECK-NEXT: 1 4 1.00 cmla z31.b, z31.b, z31.b, #180 -# CHECK-NEXT: 2 5 2.00 cmla z31.d, z31.d, z31.d, #180 +# CHECK-NEXT: 1 5 2.00 cmla z31.d, z31.d, z31.d, #180 # CHECK-NEXT: 1 4 1.00 cmla z31.h, z30.h, z7.h[0], #180 # CHECK-NEXT: 1 4 1.00 cmla z31.h, z31.h, z31.h, #180 # CHECK-NEXT: 1 4 1.00 cmla z31.s, z30.s, z7.s[0], #180 @@ -4938,7 +4938,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 2 1.00 lsrr z0.h, p0/m, z0.h, z0.h # CHECK-NEXT: 1 2 1.00 lsrr z0.s, p0/m, z0.s, z0.s # CHECK-NEXT: 1 4 1.00 mad z0.b, p7/m, z1.b, z31.b -# CHECK-NEXT: 2 5 2.00 mad z0.d, p7/m, 
z1.d, z31.d +# CHECK-NEXT: 1 5 2.00 mad z0.d, p7/m, z1.d, z31.d # CHECK-NEXT: 1 4 1.00 mad z0.h, p7/m, z1.h, z31.h # CHECK-NEXT: 1 4 1.00 mad z0.s, p7/m, z1.s, z31.s # CHECK-NEXT: 1 2 0.50 match p0.b, p0/z, z0.b, z0.b @@ -4946,15 +4946,15 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 2 0.50 match p15.b, p7/z, z30.b, z31.b # CHECK-NEXT: 1 2 0.50 match p15.h, p7/z, z30.h, z31.h # CHECK-NEXT: 1 4 1.00 mla z0.b, p7/m, z1.b, z31.b -# CHECK-NEXT: 2 5 2.00 mla z0.d, p7/m, z1.d, z31.d -# CHECK-NEXT: 2 5 2.00 mla z0.d, z1.d, z7.d[1] +# CHECK-NEXT: 1 5 2.00 mla z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: 1 5 2.00 mla z0.d, z1.d, z7.d[1] # CHECK-NEXT: 1 4 1.00 mla z0.h, p7/m, z1.h, z31.h # CHECK-NEXT: 1 4 1.00 mla z0.h, z1.h, z7.h[7] # CHECK-NEXT: 1 4 1.00 mla z0.s, p7/m, z1.s, z31.s # CHECK-NEXT: 1 4 1.00 mla z0.s, z1.s, z7.s[3] # CHECK-NEXT: 1 4 1.00 mls z0.b, p7/m, z1.b, z31.b -# CHECK-NEXT: 2 5 2.00 mls z0.d, p7/m, z1.d, z31.d -# CHECK-NEXT: 2 5 2.00 mls z0.d, z1.d, z7.d[1] +# CHECK-NEXT: 1 5 2.00 mls z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: 1 5 2.00 mls z0.d, z1.d, z7.d[1] # CHECK-NEXT: 1 4 1.00 mls z0.h, p7/m, z1.h, z31.h # CHECK-NEXT: 1 4 1.00 mls z0.h, z1.h, z7.h[7] # CHECK-NEXT: 1 4 1.00 mls z0.s, p7/m, z1.s, z31.s @@ -5072,7 +5072,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 1 0.25 U mrs x3, ZCR_EL2 # CHECK-NEXT: 1 1 0.25 U mrs x3, ZCR_EL3 # CHECK-NEXT: 1 4 1.00 msb z0.b, p7/m, z1.b, z31.b -# CHECK-NEXT: 2 5 2.00 msb z0.d, p7/m, z1.d, z31.d +# CHECK-NEXT: 1 5 2.00 msb z0.d, p7/m, z1.d, z31.d # CHECK-NEXT: 1 4 1.00 msb z0.h, p7/m, z1.h, z31.h # CHECK-NEXT: 1 4 1.00 msb z0.s, p7/m, z1.s, z31.s # CHECK-NEXT: 1 1 0.25 U msr ZCR_EL1, x3 @@ -5790,35 +5790,35 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 2 0.50 sqneg z31.h, p7/m, z31.h # CHECK-NEXT: 1 2 0.50 sqneg z31.s, p7/m, z31.s # CHECK-NEXT: 1 4 1.00 sqrdcmlah z0.b, z1.b, z2.b, #0 -# CHECK-NEXT: 2 5 2.00 sqrdcmlah z0.d, z1.d, z2.d, #0 +# CHECK-NEXT: 1 5 2.00 sqrdcmlah z0.d, z1.d, z2.d, #0 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z0.h, 
z1.h, z2.h, #0 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z0.h, z1.h, z2.h[0], #0 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z0.s, z1.s, z2.s, #0 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z0.s, z1.s, z2.s[0], #0 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z15.b, z16.b, z17.b, #270 -# CHECK-NEXT: 2 5 2.00 sqrdcmlah z15.d, z16.d, z17.d, #270 +# CHECK-NEXT: 1 5 2.00 sqrdcmlah z15.d, z16.d, z17.d, #270 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z15.h, z16.h, z17.h, #270 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z15.s, z16.s, z17.s, #270 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z29.b, z30.b, z31.b, #90 -# CHECK-NEXT: 2 5 2.00 sqrdcmlah z29.d, z30.d, z31.d, #90 +# CHECK-NEXT: 1 5 2.00 sqrdcmlah z29.d, z30.d, z31.d, #90 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z29.h, z30.h, z31.h, #90 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z29.s, z30.s, z31.s, #90 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z31.b, z31.b, z31.b, #180 -# CHECK-NEXT: 2 5 2.00 sqrdcmlah z31.d, z31.d, z31.d, #180 +# CHECK-NEXT: 1 5 2.00 sqrdcmlah z31.d, z31.d, z31.d, #180 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z31.h, z30.h, z7.h[0], #180 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z31.h, z31.h, z31.h, #180 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z31.s, z30.s, z7.s[0], #180 # CHECK-NEXT: 1 4 1.00 sqrdcmlah z31.s, z31.s, z31.s, #180 # CHECK-NEXT: 1 4 1.00 sqrdmlah z0.b, z1.b, z31.b -# CHECK-NEXT: 2 5 2.00 sqrdmlah z0.d, z1.d, z15.d[1] -# CHECK-NEXT: 2 5 2.00 sqrdmlah z0.d, z1.d, z31.d +# CHECK-NEXT: 1 5 2.00 sqrdmlah z0.d, z1.d, z15.d[1] +# CHECK-NEXT: 1 5 2.00 sqrdmlah z0.d, z1.d, z31.d # CHECK-NEXT: 1 4 1.00 sqrdmlah z0.h, z1.h, z31.h # CHECK-NEXT: 1 4 1.00 sqrdmlah z0.h, z1.h, z7.h[7] # CHECK-NEXT: 1 4 1.00 sqrdmlah z0.s, z1.s, z31.s # CHECK-NEXT: 1 4 1.00 sqrdmlah z0.s, z1.s, z7.s[3] # CHECK-NEXT: 1 4 1.00 sqrdmlsh z0.b, z1.b, z31.b -# CHECK-NEXT: 2 5 2.00 sqrdmlsh z0.d, z1.d, z15.d[1] -# CHECK-NEXT: 2 5 2.00 sqrdmlsh z0.d, z1.d, z31.d +# CHECK-NEXT: 1 5 2.00 sqrdmlsh z0.d, z1.d, z15.d[1] +# CHECK-NEXT: 1 5 2.00 sqrdmlsh z0.d, z1.d, z31.d # CHECK-NEXT: 1 4 1.00 sqrdmlsh z0.h, z1.h, z31.h # CHECK-NEXT: 1 4 1.00 sqrdmlsh 
z0.h, z1.h, z7.h[7] # CHECK-NEXT: 1 4 1.00 sqrdmlsh z0.s, z1.s, z31.s