Skip to content

Conversation

@Asher8118
Copy link
Contributor

Introduce a description of late forwarding to the Neoverse-N3 scheduling model.

@llvmbot
Copy link
Member

llvmbot commented Nov 10, 2025

@llvm/pr-subscribers-backend-aarch64

Author: Asher Dobrescu (Asher8118)

Changes

Introduce a description of late forwarding to the Neoverse-N3 scheduling model.


Patch is 109.42 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167302.diff

2 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td (+159-46)
  • (added) llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s (+2034)
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
index 411b372a3f533..d3705b932bf62 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
@@ -553,6 +553,107 @@ def N3Write_16c_16V0 : SchedWriteRes<[N3UnitV0, N3UnitV0, N3UnitV0, N3UnitV0,
     let NumMicroOps = 16;
 }
 
+//===----------------------------------------------------------------------===//
+// Define forwarded types
+// NOTE: SOG, p. 19, n. 2: Accumulator forwarding appears not to be supported
+// for consumers of 64-bit multiply high operations (to be confirmed).
+
+def N3Wr_FMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_FMA : SchedReadAdvance<2, [WriteFMul, N3Wr_FMA]>;
+
+def N3Wr_VMA : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_VMA : SchedReadAdvance<3, [N3Wr_VMA]>;
+
+def N3Wr_VMAL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_VMAL : SchedReadAdvance<3, [N3Wr_VMAL]>;
+
+def N3Wr_VMAH : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_VMAH : SchedReadAdvance<2, [N3Wr_VMAH]>;
+
+def N3Wr_VMASL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_VMASL : SchedReadAdvance<2, [N3Wr_VMASL]>;
+
+def N3Wr_ADA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
+def N3Rd_ADA : SchedReadAdvance<3, [N3Wr_ADA]>;
+
+def N3Wr_VDOT : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
+def N3Rd_VDOT : SchedReadAdvance<2, [N3Wr_VDOT]>;
+
+def N3Wr_VMMA : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
+def N3Rd_VMMA : SchedReadAdvance<2, [N3Wr_VMMA]>;
+
+def N3Wr_FCMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_FCMA : SchedReadAdvance<2, [N3Wr_FCMA]>;
+
+def N3Wr_FPM : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
+def N3Wr_FPMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_FPMA : SchedReadAdvance<2, [N3Wr_FPM, N3Wr_FPMA]>;
+
+def N3Wr_FPMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_FPMAL : SchedReadAdvance<2, [N3Wr_FPMAL]>;
+
+def N3Wr_BFD : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_BFD : SchedReadAdvance<2, [N3Wr_BFD]>;
+
+def N3Wr_BFMMA : SchedWriteRes<[N3UnitV]> { let Latency = 5; }
+def N3Rd_BFMMA : SchedReadAdvance<2, [N3Wr_BFMMA]>;
+
+def N3Wr_BFMLA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_BFMLA : SchedReadAdvance<2, [N3Wr_BFMLA]>;
+
+def N3Wr_CRC : SchedWriteRes<[N3UnitM0]> { let Latency = 2; }
+def N3Rd_CRC : SchedReadAdvance<1, [N3Wr_CRC]>;
+
+def N3Wr_ZA  : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
+def N3Rd_ZA  : SchedReadAdvance<3, [N3Wr_ZA]>;
+def N3Wr_ZPA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
+def N3Rd_ZPA : SchedReadAdvance<3, [N3Wr_ZPA]>;
+def N3Wr_ZSA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
+def N3Rd_ZSA : SchedReadAdvance<3, [N3Wr_ZSA]>;
+
+def N3Wr_ZDOTB : SchedWriteRes<[N3UnitV]>   { let Latency = 3; }
+def N3Rd_ZDOTB : SchedReadAdvance<2, [N3Wr_ZDOTB]>;
+def N3Wr_ZDOTH : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_ZDOTH : SchedReadAdvance<3, [N3Wr_ZDOTH]>;
+
+def N3Wr_ZCMABHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_ZCMABHS : SchedReadAdvance<3, [N3Wr_ZCMABHS]>;
+def N3Wr_ZCMAD   : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; }
+def N3Rd_ZCMAD   : SchedReadAdvance<2, [N3Wr_ZCMAD]>;
+
+def N3Wr_ZMMA : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
+def N3Rd_ZMMA : SchedReadAdvance<2, [N3Wr_ZMMA]>;
+
+def N3Wr_ZMABHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_ZMABHS : SchedReadAdvance<3, [N3Wr_ZMABHS]>;
+def N3Wr_ZMAD  : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; }
+def N3Rd_ZMAD  : SchedReadAdvance<2, [N3Wr_ZMAD]>;
+
+def N3Wr_ZMAL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
+def N3Rd_ZMAL : SchedReadAdvance<3, [N3Wr_ZMAL]>;
+
+def N3Wr_ZMASQL   : SchedWriteRes<[N3UnitV0]>            { let Latency = 4; }
+def N3Wr_ZMASQBHS : SchedWriteRes<[N3UnitV0]>            { let Latency = 4; }
+def N3Wr_ZMASQD   : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; }
+def N3Rd_ZMASQ    : SchedReadAdvance<2, [N3Wr_ZMASQL, N3Wr_ZMASQBHS,
+                                        N3Wr_ZMASQD]>;
+
+def N3Wr_ZFCMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZFCMA : SchedReadAdvance<2, [N3Wr_ZFCMA]>;
+
+def N3Wr_ZFMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZFMA : SchedReadAdvance<2, [N3Wr_ZFMA]>;
+
+def N3Wr_ZFMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZFMAL : SchedReadAdvance<2, [N3Wr_ZFMAL]>;
+
+def N3Wr_ZBFDOT : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZBFDOT : SchedReadAdvance<2, [N3Wr_ZBFDOT]>;
+def N3Wr_ZBFMMA : SchedWriteRes<[N3UnitV]> { let Latency = 5; }
+def N3Rd_ZBFMMA : SchedReadAdvance<2, [N3Wr_ZBFMMA]>;
+def N3Wr_ZBFMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
+def N3Rd_ZBFMAL : SchedReadAdvance<2, [N3Wr_ZBFMAL]>;
+
 // Miscellaneous
 // -----------------------------------------------------------------------------
 
@@ -832,10 +933,11 @@ def : SchedAlias<WriteFDiv , N3Write_7c_1V0>;
 def : InstRW<[N3Write_12c_1V0], (instrs FDIVDrr, FSQRTDr)>;
 
 // FP multiply
-def : SchedAlias<WriteFMul, N3Write_3c_1V>;
+def : WriteRes<WriteFMul, [N3UnitV]> { let Latency = 3; }
 
 // FP multiply accumulate
-def : InstRW<[N3Write_4c_1V], (instregex "^(FMADD|FMSUB|FNMADD|FNMSUB)[DHS]rrr$")>;
+def : InstRW<[N3Wr_FMA, ReadDefault, ReadDefault, N3Rd_FMA],
+             (instregex "^(FMADD|FMSUB|FNMADD|FNMSUB)[DHS]rrr$")>;
 
 // FP round to integral
 def : InstRW<[N3Write_3c_1V0], (instregex "^FRINT([AIMNPXZ]|32X|64X|32Z|64Z)[DHS]r$")>;
@@ -969,7 +1071,7 @@ def : SchedAlias<WriteVq, N3Write_2c_1V>;
 // ASIMD absolute diff accum long
 // ASIMD pairwise add and accumulate long
 // ASIMD shift accumulate
-def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABAL?v",
+def : InstRW<[N3Wr_ADA, N3Rd_ADA], (instregex "^[SU]ABAL?v",
                                           "^[SU]ADALPv",
                                           "^[SU]R?SRAv")>;
 
@@ -984,10 +1086,11 @@ def : InstRW<[N3Write_6c_2V1], (instregex "^[SU]?ADDL?Vv16i8v$")>;
 
 // ASIMD dot product
 // ASIMD dot product using signed and unsigned integers
-def : InstRW<[N3Write_3c_1V], (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
+def : InstRW<[N3Wr_VDOT, N3Rd_VDOT],
+             (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
 
 // ASIMD matrix multiply-accumulate
-def : InstRW<[N3Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>;
+def : InstRW<[N3Wr_VMMA, N3Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
 
 // ASIMD max/min, reduce, 4H/4S
 def : InstRW<[N3Write_3c_1V1], (instregex "^[SU](MAX|MIN)Vv4i(16|32)v$")>;
@@ -1002,16 +1105,16 @@ def : InstRW<[N3Write_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
 def : InstRW<[N3Write_4c_1V0], (instregex "^MULv", "^SQ(R)?DMULHv")>;
 
 // ASIMD multiply accumulate
-def : InstRW<[N3Write_4c_1V0], (instregex "^MLAv", "^MLSv")>;
+def : InstRW<[N3Wr_VMA, N3Rd_VMA], (instregex "^MLAv", "^MLSv")>;
 
 // ASIMD multiply accumulate high
-def : InstRW<[N3Write_4c_1V0], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
+def : InstRW<[N3Wr_VMAH, N3Rd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
 
 // ASIMD multiply accumulate long
-def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
+def : InstRW<[N3Wr_VMAL, N3Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
 
 // ASIMD multiply accumulate saturating long
-def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMLALv", "^SQDMLSLv")>;
+def : InstRW<[N3Wr_VMASL, N3Rd_VMASL], (instregex "^SQDMLALv", "^SQDMLSLv")>;
 
 // ASIMD multiply/multiply long (8x8) polynomial, D-form
 // ASIMD multiply/multiply long (8x8) polynomial, Q-form
@@ -1058,7 +1161,7 @@ def : InstRW<[N3Write_4c_1V1],
 def : InstRW<[N3Write_3c_1V], (instregex "^FCADDv")>;
 
 // ASIMD FP complex multiply add
-def : InstRW<[N3Write_4c_1V], (instregex "^FCMLAv")>;
+def : InstRW<[N3Wr_FCMA, N3Rd_FCMA], (instregex "^FCMLAv")>;
 
 // ASIMD FP convert, long (F16 to F32)
 def : InstRW<[N3Write_4c_2V0], (instregex "^FCVTL(v4|v8)i16")>;
@@ -1114,13 +1217,13 @@ def : InstRW<[N3Write_4c_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>;
 def : InstRW<[N3Write_6c_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>;
 
 // ASIMD FP multiply
-def : InstRW<[N3Write_3c_1V], (instregex "^FMULv", "^FMULXv")>;
+def : InstRW<[N3Wr_FPM], (instregex "^FMULv", "^FMULXv")>;
 
 // ASIMD FP multiply accumulate
-def : InstRW<[N3Write_4c_1V], (instregex "^FMLAv", "^FMLSv")>;
+def : InstRW<[N3Wr_FPMA, N3Rd_FPMA], (instregex "^FMLAv", "^FMLSv")>;
 
 // ASIMD FP multiply accumulate long
-def : InstRW<[N3Write_4c_1V], (instregex "^FMLALv", "^FMLSLv")>;
+def : InstRW<[N3Wr_FPMAL, N3Rd_FPMAL], (instregex "^FMLALv", "^FMLSLv")>;
 
 // ASIMD FP round, D-form F32 and Q-form F64
 def : InstRW<[N3Write_3c_1V0],
@@ -1157,13 +1260,14 @@ def : InstRW<[N3Write_13c_2V0], (instrs FSQRTv2f64)>;
 def : InstRW<[N3Write_4c_2V0], (instrs BFCVTN, BFCVTN2)>;
 
 // ASIMD dot product
-def : InstRW<[N3Write_4c_1V], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
+def : InstRW<[N3Wr_BFD, N3Rd_BFD], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
 
 // ASIMD matrix multiply accumulate
-def : InstRW<[N3Write_5c_1V], (instrs BFMMLA)>;
+def : InstRW<[N3Wr_BFMMA, N3Rd_BFMMA], (instrs BFMMLA)>;
 
 // ASIMD multiply accumulate long
-def : InstRW<[N3Write_4c_1V], (instrs BFMLALB, BFMLALBIdx, BFMLALT, BFMLALTIdx)>;
+def : InstRW<[N3Wr_BFMLA, N3Rd_BFMLA],
+             (instrs BFMLALB, BFMLALBIdx, BFMLALT, BFMLALTIdx)>;
 
 // Scalar convert, F32 to BF16
 def : InstRW<[N3Write_3c_1V0], (instrs BFCVT)>;
@@ -1502,7 +1606,7 @@ def : InstRW<[N3Write_4c_1V0], (instrs SM4E, SM4ENCKEY)>;
 // -----------------------------------------------------------------------------
 
 // CRC checksum ops
-def : InstRW<[N3Write_2c_1M0], (instregex "^CRC32")>;
+def : InstRW<[N3Wr_CRC, N3Rd_CRC], (instregex "^CRC32")>;
 
 // SVE Predicate instructions
 // -----------------------------------------------------------------------------
@@ -1592,10 +1696,10 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]",
                                          "^[SU]ABD_ZPZZ_[BHSD]")>;
 
 // Arithmetic, absolute diff accum
-def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>;
+def : InstRW<[N3Wr_ZA, N3Rd_ZA], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>;
 
 // Arithmetic, absolute diff accum long
-def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>;
+def : InstRW<[N3Wr_ZA, N3Rd_ZA], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>;
 
 // Arithmetic, absolute diff long
 def : InstRW<[N3Write_2c_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]$")>;
@@ -1629,7 +1733,8 @@ def : InstRW<[N3Write_2c_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]$")>;
 def : InstRW<[N3Write_2c_1V], (instregex "^ADDP_ZPmZ_[BHSD]$")>;
 
 // Arithmetic, pairwise add and accum long
-def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ADALP_ZPmZ_[HSD]$")>;
+def : InstRW<[N3Wr_ZPA, ReadDefault, N3Rd_ZPA],
+             (instregex "^[SU]ADALP_ZPmZ_[HSD]$")>;
 
 // Arithmetic, shift
 def : InstRW<[N3Write_2c_1V1],
@@ -1642,7 +1747,7 @@ def : InstRW<[N3Write_2c_1V1],
                         "^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;
 
 // Arithmetic, shift and accumulate
-def : InstRW<[N3Write_4c_1V1],
+def : InstRW<[N3Wr_ZSA, N3Rd_ZSA],
              (instregex "^(SRSRA|SSRA|URSRA|USRA)_ZZI_[BHSD]$")>;
 
 // Arithmetic, shift by immediate
@@ -1688,16 +1793,17 @@ def : InstRW<[N3Write_2c_1V],
 def : InstRW<[N3Write_2c_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]$")>;
 
 // Complex dot product 8-bit element
-def : InstRW<[N3Write_3c_1V], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
+def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
 
 // Complex dot product 16-bit element
-def : InstRW<[N3Write_4c_1V0], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
+def : InstRW<[N3Wr_ZDOTH, N3Rd_ZDOTH], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
 
 // Complex multiply-add B, H, S element size
-def : InstRW<[N3Write_4c_1V0], (instregex "^CMLA_ZZZ_[BHS]$", "^CMLA_ZZZI_[HS]$")>;
+def : InstRW<[N3Wr_ZCMABHS, N3Rd_ZCMABHS],
+             (instregex "^CMLA_ZZZ_[BHS]$", "^CMLA_ZZZI_[HS]$")>;
 
 // Complex multiply-add D element size
-def : InstRW<[N3Write_5c_2V0], (instrs CMLA_ZZZ_D)>;
+def : InstRW<[N3Wr_ZCMAD, N3Rd_ZCMAD], (instrs CMLA_ZZZ_D)>;
 
 // Conditional extract operations, scalar form
 def : InstRW<[N3Write_8c_1M0_1V], (instregex "^CLAST[AB]_RPZ_[BHSD]$")>;
@@ -1736,13 +1842,14 @@ def : InstRW<[N3Write_16c_16V0], (instregex "^[SU]DIVR?_ZPmZ_D",
                                             "^[SU]DIV_ZPZZ_D")>;
 
 // Dot product, 8 bit
-def : InstRW<[N3Write_3c_1V], (instregex "^[SU]DOT_ZZZI?_BtoS$")>;
+def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_BtoS$")>;
 
 // Dot product, 8 bit, using signed and unsigned integers
-def : InstRW<[N3Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
+def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB],
+             (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
 
 // Dot product, 16 bit
-def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_HtoD$")>;
+def : InstRW<[N3Wr_ZDOTH, N3Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_HtoD$")>;
 
 // Duplicate, immediate and indexed form
 def : InstRW<[N3Write_2c_1V], (instregex "^DUP_ZI_[BHSD]$",
@@ -1804,7 +1911,7 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
 def : InstRW<[N3Write_2c_1V], (instregex "^N?MATCH_PPzZZ_[BH]$")>;
 
 // Matrix multiply-accumulate
-def : InstRW<[N3Write_3c_1V], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
+def : InstRW<[N3Wr_ZMMA, N3Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
 
 // Move prefix
 def : InstRW<[N3Write_2c_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$",
@@ -1827,20 +1934,22 @@ def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]MULL[BT]_ZZZI_[SD]$",
                                           "^[SU]MULL[BT]_ZZZ_[HSD]$")>;
 
 // Multiply accumulate, B, H, S element size
-def : InstRW<[N3Write_4c_1V0], (instregex "^ML[AS]_ZZZI_[BHS]$",
-                                          "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_[BHS]")>;
+def : InstRW<[N3Wr_ZMABHS, ReadDefault, N3Rd_ZMABHS],
+             (instregex "^ML[AS]_ZZZI_[BHS]$",
+                        "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_[BHS]")>;
 
 // Multiply accumulate, D element size
-def : InstRW<[N3Write_5c_2V0], (instregex "^ML[AS]_ZZZI_D$",
+def : InstRW<[N3Wr_ZMAD, ReadDefault, N3Rd_ZMAD], (instregex "^ML[AS]_ZZZI_D$",
                                           "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>;
 
 // Multiply accumulate long
-def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$",
+def : InstRW<[N3Wr_ZMAL, N3Rd_ZMAL], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$",
                                           "^[SU]ML[AS]L[BT]_ZZZI_[SD]$")>;
 
 // Multiply accumulate saturating doubling long regular
-def : InstRW<[N3Write_4c_1V0], (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$",
-                                          "^SQDML[AS](LB|LT)_ZZZI_[SD]$")>;
+def : InstRW<[N3Wr_ZMASQL, N3Rd_ZMASQ],
+            (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$",
+                       "^SQDML[AS](LB|LT)_ZZZI_[SD]$")>;
 
 // Multiply saturating doubling high, B, H, S element size
 def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMULH_ZZZ_[BHS]$",
@@ -1854,13 +1963,13 @@ def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMULL[BT]_ZZZ_[HSD]$",
                                           "^SQDMULL[BT]_ZZZI_[SD]$")>;
 
 // Multiply saturating rounding doubling regular/complex accumulate, B, H, S element size
-def : InstRW<[N3Write_4c_1V0], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$",
+def : InstRW<[N3Wr_ZMASQBHS, N3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$",
                                           "^SQRDCMLAH_ZZZ_[BHS]$",
                                           "^SQRDML[AS]H_ZZZI_[HS]$",
                                           "^SQRDCMLAH_ZZZI_[HS]$")>;
 
 // Multiply saturating rounding doubling regular/complex accumulate, D element size
-def : InstRW<[N3Write_5c_2V0], (instregex "^SQRDML[AS]H_ZZZI?_D$",
+def : InstRW<[N3Wr_ZMASQD, N3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZI?_D$",
                                           "^SQRDCMLAH_ZZZ_D$")>;
 
 // Multiply saturating rounding doubling regular/complex, B, H, S element size
@@ -1949,8 +2058,9 @@ def : InstRW<[N3Write_2c_1V], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$",
 def : InstRW<[N3Write_3c_1V], (instregex "^FCADD_ZPmZ_[HSD]$")>;
 
 // Floating point complex multiply add
-def : InstRW<[N3Write_4c_1V], (instregex "^FCMLA_ZPmZZ_[HSD]$",
-                                         "^FCMLA_ZZZI_[HS]$")>;
+def : InstRW<[N3Wr_ZFCMA, ReadDefault, N3Rd_ZFCMA],
+             (instregex "^FCMLA_ZPmZZ_[HSD]")>;
+def : InstRW<[N3Wr_ZFCMA, N3Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>;
 
 // Floating point convert, long or narrow (F16 to F32 or F32 to F16)
 def : InstRW<[N3Write_4c_2V0], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
@@ -2014,12 +2124,15 @@ def : InstRW<[N3Write_3c_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
                                          "^FMUL_ZPZ[IZ]_[HSD]")>;
 
 // Floating point multiply accumulate
-def : InstRW<[N3Write_4c_1V], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$",
-                                         "^FN?ML[AS]_ZPZZZ_[HSD]",
-                                         "^FML[AS]_ZZZI_[HSD]$")>;
+def : InstRW<[N3Wr_ZFMA, ReadDefault, N3Rd_ZFMA],
+             (instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
+                        "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
+def : InstRW<[N3Wr_ZFMA, N3Rd_ZFMA],
+             (instregex "^FML[AS]_ZZZI_[HSD]",
+                        "^FN?ML[AS]_ZPZZZ_[HSD]")>;
 
 // Floating point multiply add/sub accumulate long
-def : InstRW<[N3Write_4c_1V], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>;
+def : InstRW<[N3Wr_ZFMAL, N3Rd_ZFMAL], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>;
 
 // Floating point reciprocal estimate, F16
 def : InstRW<[N3Write_6c_4V0], (instregex "^FR(ECP|SQRT)E_ZZ_H", "^FRECPX_ZPmZ_H")>;
@@ -2079,13 +2192,13 @@ def : InstRW<[N3Write_3c_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]$")>;
 def : InstRW<[N3Write_4c_2V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
 
 // Dot product
-def : InstRW<[N3Write_4c_1V], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
+def : InstRW<[N3Wr_ZBFDOT, N3Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
 
 // Matrix multiply accumulate
-def : InstRW<[N3Write_5c_1V], (instrs BFMMLA_ZZZ_HtoS)>;
+def : InstRW<[N3Wr_ZBFMMA, N3Rd_ZBFMMA], (instrs BFMMLA_ZZZ_HtoS)>;
 
 // Multiply accumulate long
-def : InstRW<[N3Write_4c_1V], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
+def : InstRW<[N3Wr_ZBFMAL, N3Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
 
 // SVE Load instructions
 // -----------------------------------------------------------------------------
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s
new file mode 100644
index 0000000000000..f6b9db13624b6
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-forwarding.s
@@ -0,0 +1,2034 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-n3 -mattr=+sve --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s
+
+# LLVM-MCA-BEGIN madd
+mul  x0, x0, x0
+madd x0, x1, x2, x0
+madd x0, x1, x2, x0
+madd x0, x0, x0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smaddl
+mul    x0, x0, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w1, w2, x0
+smaddl x0, w0, w0, x0
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN fmadd
+fadd  d0, d0, d0
+fmadd d0, d1, d2, d0
+fmul  d0, d0, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d1, d2, d0
+fmadd d0, d0, d1, d2
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN saba
+mul  v0.4s, v0.4s, v0.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v1.4s, v2.4s
+saba v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sdot
+mul  v0.4s, v0.4s,  v0.4s
+sdot v0.4s, v1.16b, v2.16b
+sdot v0.4s, v1.16b, v2.16b
+sdot v0.4s, v0.16b, v1.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smmla
+mul   v0.4s, v0.4s,  v0.4s
+smmla v0.4s, v1.16b, v2.16b
+smmla v0.4s, v1.16b, v2.16b
+smmla v0.4s, v0.16b, v1.16b
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN mla
+mul v0.4s, v0.4s, v0.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v1.4s, v2.4s
+mla v0.4s, v0.4s, v1.4s
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN sqrdmlah
+mul    v0.4s, v0.4s, v0.4s
+sqrdmlah v0.8h, v1.8h, v2.8h
+sqrdmlah v0.8h, v1.8h, v2.8h
+sqrdmlah v0.8h, v1.8h, v2.8h
+# LLVM-MCA-END
+
+# LLVM-MCA-BEGIN smlal2
+mul    v0.4s, v0.4s, v0.4s
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v1.8h, v2.8h
+smlal2 v0.4s, v0.8h,...
[truncated]

Ash Dobrescu added 3 commits November 13, 2025 10:39
@Asher8118 Asher8118 merged commit ec490b1 into llvm:main Nov 13, 2025
10 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants