Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
205 changes: 159 additions & 46 deletions llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,107 @@ def N3Write_16c_16V0 : SchedWriteRes<[N3UnitV0, N3UnitV0, N3UnitV0, N3UnitV0,
let NumMicroOps = 16;
}

//===----------------------------------------------------------------------===//
// Define forwarded types
// NOTE(review): SOG, p. 19, n. 2 appears to state that accumulator forwarding
// is not supported for consumers of 64-bit multiply-high operations; confirm.

// Scalar FP multiply-accumulate (FMADD/FMSUB/FNMADD/FNMSUB): issues on
// N3UnitV with a 4-cycle result latency.
def N3Wr_FMA : SchedWriteRes<[N3UnitV]> {
  let Latency = 4;
}
// Read used for the third source operand of the instructions above. The read
// is advanced by 2 cycles when the producer is WriteFMul or N3Wr_FMA.
def N3Rd_FMA : SchedReadAdvance<2, [
  WriteFMul, N3Wr_FMA
]>;

// ASIMD multiply accumulate (MLA/MLS): V0, 4-cycle latency, with a 3-cycle
// read advance from a previous N3Wr_VMA.
def N3Wr_VMA : SchedWriteRes<[N3UnitV0]> {
  let Latency = 4;
}
def N3Rd_VMA : SchedReadAdvance<3, [N3Wr_VMA]>;

// ASIMD multiply accumulate long ([SU]MLAL/[SU]MLSL): V0, 4-cycle latency,
// 3-cycle read advance from a previous N3Wr_VMAL.
def N3Wr_VMAL : SchedWriteRes<[N3UnitV0]> {
  let Latency = 4;
}
def N3Rd_VMAL : SchedReadAdvance<3, [N3Wr_VMAL]>;

// ASIMD multiply accumulate high (SQRDMLAH/SQRDMLSH): V0, 4-cycle latency,
// 2-cycle read advance from a previous N3Wr_VMAH.
def N3Wr_VMAH : SchedWriteRes<[N3UnitV0]> {
  let Latency = 4;
}
def N3Rd_VMAH : SchedReadAdvance<2, [N3Wr_VMAH]>;

// ASIMD multiply accumulate saturating long (SQDMLAL/SQDMLSL): V0, 4-cycle
// latency, 2-cycle read advance from a previous N3Wr_VMASL.
def N3Wr_VMASL : SchedWriteRes<[N3UnitV0]> {
  let Latency = 4;
}
def N3Rd_VMASL : SchedReadAdvance<2, [N3Wr_VMASL]>;

// ASIMD absolute-diff accumulate (long), pairwise add-accumulate long and
// shift-accumulate: V1, 4-cycle latency.
def N3Wr_ADA : SchedWriteRes<[N3UnitV1]> {
  let Latency = 4;
}
// Read paired with N3Wr_ADA; advanced by 3 cycles when fed by N3Wr_ADA.
def N3Rd_ADA : SchedReadAdvance<3, [N3Wr_ADA]>;

// ASIMD integer dot product ([SU]DOT/SUDOT/USDOT): any V pipe, 3-cycle
// latency, 2-cycle read advance from a previous N3Wr_VDOT.
def N3Wr_VDOT : SchedWriteRes<[N3UnitV]> {
  let Latency = 3;
}
def N3Rd_VDOT : SchedReadAdvance<2, [N3Wr_VDOT]>;

// ASIMD integer matrix multiply-accumulate (SMMLA/UMMLA/USMMLA): any V pipe,
// 3-cycle latency, 2-cycle read advance from a previous N3Wr_VMMA.
def N3Wr_VMMA : SchedWriteRes<[N3UnitV]> {
  let Latency = 3;
}
def N3Rd_VMMA : SchedReadAdvance<2, [N3Wr_VMMA]>;

// ASIMD FP complex multiply add (FCMLA): any V pipe, 4-cycle latency,
// 2-cycle read advance from a previous N3Wr_FCMA.
def N3Wr_FCMA : SchedWriteRes<[N3UnitV]> {
  let Latency = 4;
}
def N3Rd_FCMA : SchedReadAdvance<2, [N3Wr_FCMA]>;

// ASIMD FP multiply (FMUL/FMULX, 3 cycles) and FP multiply accumulate
// (FMLA/FMLS, 4 cycles). The shared read is advanced by 2 cycles when the
// producer is either of the two writes.
def N3Wr_FPM : SchedWriteRes<[N3UnitV]> {
  let Latency = 3;
}
def N3Wr_FPMA : SchedWriteRes<[N3UnitV]> {
  let Latency = 4;
}
def N3Rd_FPMA : SchedReadAdvance<2, [
  N3Wr_FPM, N3Wr_FPMA
]>;

// ASIMD FP multiply accumulate long (FMLAL/FMLSL): any V pipe, 4-cycle
// latency, 2-cycle read advance from a previous N3Wr_FPMAL.
def N3Wr_FPMAL : SchedWriteRes<[N3UnitV]> {
  let Latency = 4;
}
def N3Rd_FPMAL : SchedReadAdvance<2, [N3Wr_FPMAL]>;

// ASIMD BFloat16 dot product (BFDOTv4bf16/BFDOTv8bf16): any V pipe, 4-cycle
// latency, 2-cycle read advance from a previous N3Wr_BFD.
def N3Wr_BFD : SchedWriteRes<[N3UnitV]> {
  let Latency = 4;
}
def N3Rd_BFD : SchedReadAdvance<2, [N3Wr_BFD]>;

// ASIMD BFloat16 matrix multiply accumulate (BFMMLA): any V pipe, 5-cycle
// latency, 2-cycle read advance from a previous N3Wr_BFMMA.
def N3Wr_BFMMA : SchedWriteRes<[N3UnitV]> {
  let Latency = 5;
}
def N3Rd_BFMMA : SchedReadAdvance<2, [N3Wr_BFMMA]>;

// ASIMD BFloat16 multiply accumulate long (BFMLALB/BFMLALT and their Idx
// forms): any V pipe, 4-cycle latency, 2-cycle read advance from a previous
// N3Wr_BFMLA.
def N3Wr_BFMLA : SchedWriteRes<[N3UnitV]> {
  let Latency = 4;
}
def N3Rd_BFMLA : SchedReadAdvance<2, [N3Wr_BFMLA]>;

// CRC checksum ops (CRC32*): M0, 2-cycle latency.
def N3Wr_CRC : SchedWriteRes<[N3UnitM0]> {
  let Latency = 2;
}
// Read paired with N3Wr_CRC; advanced by 1 cycle when fed by N3Wr_CRC.
def N3Rd_CRC : SchedReadAdvance<1, [N3Wr_CRC]>;

// SVE absolute diff accumulate and accumulate long ([SU]ABA, [SU]ABAL[TB]):
// V1, 4-cycle latency, 3-cycle read advance from a previous N3Wr_ZA.
def N3Wr_ZA : SchedWriteRes<[N3UnitV1]> {
  let Latency = 4;
}
def N3Rd_ZA : SchedReadAdvance<3, [N3Wr_ZA]>;
// SVE pairwise add and accumulate long ([SU]ADALP_ZPmZ): V1, 4-cycle latency,
// 3-cycle read advance from a previous N3Wr_ZPA.
def N3Wr_ZPA : SchedWriteRes<[N3UnitV1]> {
  let Latency = 4;
}
def N3Rd_ZPA : SchedReadAdvance<3, [N3Wr_ZPA]>;
// SVE shift and accumulate (SRSRA/SSRA/URSRA/USRA): V1, 4-cycle latency,
// 3-cycle read advance from a previous N3Wr_ZSA.
def N3Wr_ZSA : SchedWriteRes<[N3UnitV1]> {
  let Latency = 4;
}
def N3Rd_ZSA : SchedReadAdvance<3, [N3Wr_ZSA]>;

// SVE dot product with 8-bit elements (CDOT .S, [SU]DOT BtoS, SUDOT/USDOT):
// any V pipe, 3-cycle latency, 2-cycle read advance from a previous
// N3Wr_ZDOTB.
def N3Wr_ZDOTB : SchedWriteRes<[N3UnitV]> {
  let Latency = 3;
}
def N3Rd_ZDOTB : SchedReadAdvance<2, [N3Wr_ZDOTB]>;
// SVE dot product with 16-bit elements (CDOT .D, [SU]DOT HtoD): V0, 4-cycle
// latency, 3-cycle read advance from a previous N3Wr_ZDOTH.
def N3Wr_ZDOTH : SchedWriteRes<[N3UnitV0]> {
  let Latency = 4;
}
def N3Rd_ZDOTH : SchedReadAdvance<3, [N3Wr_ZDOTH]>;

// SVE complex multiply-add, B/H/S element size (CMLA): V0, 4-cycle latency,
// 3-cycle read advance from a previous N3Wr_ZCMABHS.
def N3Wr_ZCMABHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
def N3Rd_ZCMABHS : SchedReadAdvance<3, [N3Wr_ZCMABHS]>;
// SVE complex multiply-add, D element size (CMLA_ZZZ_D): occupies V0 twice,
// 5-cycle latency. One micro-op is counted per listed unit, following the
// convention of N3Write_16c_16V0 (16 units, NumMicroOps = 16).
// NOTE(review): confirm that the N3Write_5c_2V0 this write replaces also
// used NumMicroOps = 2.
def N3Wr_ZCMAD : SchedWriteRes<[N3UnitV0, N3UnitV0]> {
  let Latency = 5;
  let NumMicroOps = 2;
}
def N3Rd_ZCMAD : SchedReadAdvance<2, [N3Wr_ZCMAD]>;

// SVE integer matrix multiply-accumulate (SMMLA/UMMLA/USMMLA _ZZZ): any V
// pipe, 3-cycle latency, 2-cycle read advance from a previous N3Wr_ZMMA.
def N3Wr_ZMMA : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
def N3Rd_ZMMA : SchedReadAdvance<2, [N3Wr_ZMMA]>;

// SVE multiply accumulate, B/H/S element size (ML[AS]/MAD/MSB): V0, 4-cycle
// latency, 3-cycle read advance from a previous N3Wr_ZMABHS.
def N3Wr_ZMABHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
def N3Rd_ZMABHS : SchedReadAdvance<3, [N3Wr_ZMABHS]>;
// SVE multiply accumulate, D element size: occupies V0 twice, 5-cycle
// latency, one micro-op per listed unit (see N3Wr_ZCMAD note on
// NumMicroOps; NOTE(review): confirm against N3Write_5c_2V0).
def N3Wr_ZMAD : SchedWriteRes<[N3UnitV0, N3UnitV0]> {
  let Latency = 5;
  let NumMicroOps = 2;
}
def N3Rd_ZMAD : SchedReadAdvance<2, [N3Wr_ZMAD]>;

// SVE multiply accumulate long ([SU]ML[AS]L[BT]): V0, 4-cycle latency,
// 3-cycle read advance from a previous N3Wr_ZMAL.
def N3Wr_ZMAL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
def N3Rd_ZMAL : SchedReadAdvance<3, [N3Wr_ZMAL]>;

// SVE saturating doubling multiply accumulate: long (SQDML[AS]L*), B/H/S
// rounding high/complex (SQRDML[AS]H, SQRDCMLAH) and the D-element variants.
// All three writes share one read, advanced by 2 cycles from any of them.
def N3Wr_ZMASQL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
def N3Wr_ZMASQBHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
// D element size occupies V0 twice; one micro-op per listed unit
// (NOTE(review): confirm against N3Write_5c_2V0).
def N3Wr_ZMASQD : SchedWriteRes<[N3UnitV0, N3UnitV0]> {
  let Latency = 5;
  let NumMicroOps = 2;
}
def N3Rd_ZMASQ : SchedReadAdvance<2, [N3Wr_ZMASQL, N3Wr_ZMASQBHS,
                                      N3Wr_ZMASQD]>;

// SVE FP complex multiply add (FCMLA_ZPmZZ / FCMLA_ZZZI): any V pipe, 4-cycle
// latency, 2-cycle read advance from a previous N3Wr_ZFCMA.
def N3Wr_ZFCMA : SchedWriteRes<[N3UnitV]> {
  let Latency = 4;
}
def N3Rd_ZFCMA : SchedReadAdvance<2, [N3Wr_ZFCMA]>;

// SVE FP multiply accumulate (FMLA/FMLS/FMAD/FMSB and negated forms): any V
// pipe, 4-cycle latency, 2-cycle read advance from a previous N3Wr_ZFMA.
def N3Wr_ZFMA : SchedWriteRes<[N3UnitV]> {
  let Latency = 4;
}
def N3Rd_ZFMA : SchedReadAdvance<2, [N3Wr_ZFMA]>;

// SVE FP multiply add/sub accumulate long (FML[AS]L[BT]): any V pipe, 4-cycle
// latency, 2-cycle read advance from a previous N3Wr_ZFMAL.
def N3Wr_ZFMAL : SchedWriteRes<[N3UnitV]> {
  let Latency = 4;
}
def N3Rd_ZFMAL : SchedReadAdvance<2, [N3Wr_ZFMAL]>;

// SVE BFloat16 dot product (BFDOT_ZZI/BFDOT_ZZZ): any V pipe, 4-cycle latency,
// 2-cycle read advance from a previous N3Wr_ZBFDOT.
def N3Wr_ZBFDOT : SchedWriteRes<[N3UnitV]> {
  let Latency = 4;
}
def N3Rd_ZBFDOT : SchedReadAdvance<2, [N3Wr_ZBFDOT]>;
// SVE BFloat16 matrix multiply accumulate (BFMMLA_ZZZ_HtoS): any V pipe,
// 5-cycle latency, 2-cycle read advance from a previous N3Wr_ZBFMMA.
def N3Wr_ZBFMMA : SchedWriteRes<[N3UnitV]> {
  let Latency = 5;
}
def N3Rd_ZBFMMA : SchedReadAdvance<2, [N3Wr_ZBFMMA]>;
// SVE BFloat16 multiply accumulate long (BFMLAL[BT]_ZZZ/_ZZZI): any V pipe,
// 4-cycle latency, 2-cycle read advance from a previous N3Wr_ZBFMAL.
def N3Wr_ZBFMAL : SchedWriteRes<[N3UnitV]> {
  let Latency = 4;
}
def N3Rd_ZBFMAL : SchedReadAdvance<2, [N3Wr_ZBFMAL]>;

// Miscellaneous
// -----------------------------------------------------------------------------

Expand Down Expand Up @@ -832,10 +933,11 @@ def : SchedAlias<WriteFDiv , N3Write_7c_1V0>;
def : InstRW<[N3Write_12c_1V0], (instrs FDIVDrr, FSQRTDr)>;

// FP multiply
def : SchedAlias<WriteFMul, N3Write_3c_1V>;
def : WriteRes<WriteFMul, [N3UnitV]> { let Latency = 3; }

// FP multiply accumulate
def : InstRW<[N3Write_4c_1V], (instregex "^(FMADD|FMSUB|FNMADD|FNMSUB)[DHS]rrr$")>;
def : InstRW<[N3Wr_FMA, ReadDefault, ReadDefault, N3Rd_FMA],
(instregex "^(FMADD|FMSUB|FNMADD|FNMSUB)[DHS]rrr$")>;

// FP round to integral
def : InstRW<[N3Write_3c_1V0], (instregex "^FRINT([AIMNPXZ]|32X|64X|32Z|64Z)[DHS]r$")>;
Expand Down Expand Up @@ -969,7 +1071,7 @@ def : SchedAlias<WriteVq, N3Write_2c_1V>;
// ASIMD absolute diff accum long
// ASIMD pairwise add and accumulate long
// ASIMD shift accumulate
def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABAL?v",
def : InstRW<[N3Wr_ADA, N3Rd_ADA], (instregex "^[SU]ABAL?v",
"^[SU]ADALPv",
"^[SU]R?SRAv")>;

Expand All @@ -984,10 +1086,11 @@ def : InstRW<[N3Write_6c_2V1], (instregex "^[SU]?ADDL?Vv16i8v$")>;

// ASIMD dot product
// ASIMD dot product using signed and unsigned integers
def : InstRW<[N3Write_3c_1V], (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
def : InstRW<[N3Wr_VDOT, N3Rd_VDOT],
(instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;

// ASIMD matrix multiply-accumulate
def : InstRW<[N3Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>;
def : InstRW<[N3Wr_VMMA, N3Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;

// ASIMD max/min, reduce, 4H/4S
def : InstRW<[N3Write_3c_1V1], (instregex "^[SU](MAX|MIN)Vv4i(16|32)v$")>;
Expand All @@ -1002,16 +1105,16 @@ def : InstRW<[N3Write_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
def : InstRW<[N3Write_4c_1V0], (instregex "^MULv", "^SQ(R)?DMULHv")>;

// ASIMD multiply accumulate
def : InstRW<[N3Write_4c_1V0], (instregex "^MLAv", "^MLSv")>;
def : InstRW<[N3Wr_VMA, N3Rd_VMA], (instregex "^MLAv", "^MLSv")>;

// ASIMD multiply accumulate high
def : InstRW<[N3Write_4c_1V0], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
def : InstRW<[N3Wr_VMAH, N3Rd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;

// ASIMD multiply accumulate long
def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
def : InstRW<[N3Wr_VMAL, N3Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;

// ASIMD multiply accumulate saturating long
def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMLALv", "^SQDMLSLv")>;
def : InstRW<[N3Wr_VMASL, N3Rd_VMASL], (instregex "^SQDMLALv", "^SQDMLSLv")>;

// ASIMD multiply/multiply long (8x8) polynomial, D-form
// ASIMD multiply/multiply long (8x8) polynomial, Q-form
Expand Down Expand Up @@ -1058,7 +1161,7 @@ def : InstRW<[N3Write_4c_1V1],
def : InstRW<[N3Write_3c_1V], (instregex "^FCADDv")>;

// ASIMD FP complex multiply add
def : InstRW<[N3Write_4c_1V], (instregex "^FCMLAv")>;
def : InstRW<[N3Wr_FCMA, N3Rd_FCMA], (instregex "^FCMLAv")>;

// ASIMD FP convert, long (F16 to F32)
def : InstRW<[N3Write_4c_2V0], (instregex "^FCVTL(v4|v8)i16")>;
Expand Down Expand Up @@ -1114,13 +1217,13 @@ def : InstRW<[N3Write_4c_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>;
def : InstRW<[N3Write_6c_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>;

// ASIMD FP multiply
def : InstRW<[N3Write_3c_1V], (instregex "^FMULv", "^FMULXv")>;
def : InstRW<[N3Wr_FPM], (instregex "^FMULv", "^FMULXv")>;

// ASIMD FP multiply accumulate
def : InstRW<[N3Write_4c_1V], (instregex "^FMLAv", "^FMLSv")>;
def : InstRW<[N3Wr_FPMA, N3Rd_FPMA], (instregex "^FMLAv", "^FMLSv")>;

// ASIMD FP multiply accumulate long
def : InstRW<[N3Write_4c_1V], (instregex "^FMLALv", "^FMLSLv")>;
def : InstRW<[N3Wr_FPMAL, N3Rd_FPMAL], (instregex "^FMLALv", "^FMLSLv")>;

// ASIMD FP round, D-form F32 and Q-form F64
def : InstRW<[N3Write_3c_1V0],
Expand Down Expand Up @@ -1157,13 +1260,14 @@ def : InstRW<[N3Write_13c_2V0], (instrs FSQRTv2f64)>;
def : InstRW<[N3Write_4c_2V0], (instrs BFCVTN, BFCVTN2)>;

// ASIMD dot product
def : InstRW<[N3Write_4c_1V], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
def : InstRW<[N3Wr_BFD, N3Rd_BFD], (instrs BFDOTv4bf16, BFDOTv8bf16)>;

// ASIMD matrix multiply accumulate
def : InstRW<[N3Write_5c_1V], (instrs BFMMLA)>;
def : InstRW<[N3Wr_BFMMA, N3Rd_BFMMA], (instrs BFMMLA)>;

// ASIMD multiply accumulate long
def : InstRW<[N3Write_4c_1V], (instrs BFMLALB, BFMLALBIdx, BFMLALT, BFMLALTIdx)>;
def : InstRW<[N3Wr_BFMLA, N3Rd_BFMLA],
(instrs BFMLALB, BFMLALBIdx, BFMLALT, BFMLALTIdx)>;

// Scalar convert, F32 to BF16
def : InstRW<[N3Write_3c_1V0], (instrs BFCVT)>;
Expand Down Expand Up @@ -1502,7 +1606,7 @@ def : InstRW<[N3Write_4c_1V0], (instrs SM4E, SM4ENCKEY)>;
// -----------------------------------------------------------------------------

// CRC checksum ops
def : InstRW<[N3Write_2c_1M0], (instregex "^CRC32")>;
def : InstRW<[N3Wr_CRC, N3Rd_CRC], (instregex "^CRC32")>;

// SVE Predicate instructions
// -----------------------------------------------------------------------------
Expand Down Expand Up @@ -1592,10 +1696,10 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]",
"^[SU]ABD_ZPZZ_[BHSD]")>;

// Arithmetic, absolute diff accum
def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>;
def : InstRW<[N3Wr_ZA, N3Rd_ZA], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>;

// Arithmetic, absolute diff accum long
def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>;
def : InstRW<[N3Wr_ZA, N3Rd_ZA], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>;

// Arithmetic, absolute diff long
def : InstRW<[N3Write_2c_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]$")>;
Expand Down Expand Up @@ -1629,7 +1733,8 @@ def : InstRW<[N3Write_2c_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]$")>;
def : InstRW<[N3Write_2c_1V], (instregex "^ADDP_ZPmZ_[BHSD]$")>;

// Arithmetic, pairwise add and accum long
def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ADALP_ZPmZ_[HSD]$")>;
def : InstRW<[N3Wr_ZPA, ReadDefault, N3Rd_ZPA],
(instregex "^[SU]ADALP_ZPmZ_[HSD]$")>;

// Arithmetic, shift
def : InstRW<[N3Write_2c_1V1],
Expand All @@ -1642,7 +1747,7 @@ def : InstRW<[N3Write_2c_1V1],
"^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;

// Arithmetic, shift and accumulate
def : InstRW<[N3Write_4c_1V1],
def : InstRW<[N3Wr_ZSA, N3Rd_ZSA],
(instregex "^(SRSRA|SSRA|URSRA|USRA)_ZZI_[BHSD]$")>;

// Arithmetic, shift by immediate
Expand Down Expand Up @@ -1688,16 +1793,17 @@ def : InstRW<[N3Write_2c_1V],
def : InstRW<[N3Write_2c_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]$")>;

// Complex dot product 8-bit element
def : InstRW<[N3Write_3c_1V], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;

// Complex dot product 16-bit element
def : InstRW<[N3Write_4c_1V0], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
def : InstRW<[N3Wr_ZDOTH, N3Rd_ZDOTH], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;

// Complex multiply-add B, H, S element size
def : InstRW<[N3Write_4c_1V0], (instregex "^CMLA_ZZZ_[BHS]$", "^CMLA_ZZZI_[HS]$")>;
def : InstRW<[N3Wr_ZCMABHS, N3Rd_ZCMABHS],
(instregex "^CMLA_ZZZ_[BHS]$", "^CMLA_ZZZI_[HS]$")>;

// Complex multiply-add D element size
def : InstRW<[N3Write_5c_2V0], (instrs CMLA_ZZZ_D)>;
def : InstRW<[N3Wr_ZCMAD, N3Rd_ZCMAD], (instrs CMLA_ZZZ_D)>;

// Conditional extract operations, scalar form
def : InstRW<[N3Write_8c_1M0_1V], (instregex "^CLAST[AB]_RPZ_[BHSD]$")>;
Expand Down Expand Up @@ -1736,13 +1842,14 @@ def : InstRW<[N3Write_16c_16V0], (instregex "^[SU]DIVR?_ZPmZ_D",
"^[SU]DIV_ZPZZ_D")>;

// Dot product, 8 bit
def : InstRW<[N3Write_3c_1V], (instregex "^[SU]DOT_ZZZI?_BtoS$")>;
def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_BtoS$")>;

// Dot product, 8 bit, using signed and unsigned integers
def : InstRW<[N3Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB],
(instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;

// Dot product, 16 bit
def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_HtoD$")>;
def : InstRW<[N3Wr_ZDOTH, N3Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_HtoD$")>;

// Duplicate, immediate and indexed form
def : InstRW<[N3Write_2c_1V], (instregex "^DUP_ZI_[BHSD]$",
Expand Down Expand Up @@ -1804,7 +1911,7 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
def : InstRW<[N3Write_2c_1V], (instregex "^N?MATCH_PPzZZ_[BH]$")>;

// Matrix multiply-accumulate
def : InstRW<[N3Write_3c_1V], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
def : InstRW<[N3Wr_ZMMA, N3Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;

// Move prefix
def : InstRW<[N3Write_2c_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$",
Expand All @@ -1827,20 +1934,22 @@ def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]MULL[BT]_ZZZI_[SD]$",
"^[SU]MULL[BT]_ZZZ_[HSD]$")>;

// Multiply accumulate, B, H, S element size
def : InstRW<[N3Write_4c_1V0], (instregex "^ML[AS]_ZZZI_[BHS]$",
"^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_[BHS]")>;
// Predicated forms: ReadDefault covers the operand that precedes the one
// receiving the forwarding read. This mirrors the FCMLA and FML[AS] entries
// in this file, which apply ReadDefault only to predicated forms.
// NOTE(review): the FML[AS] entry treats its ZPZZZ pseudo like the
// unpredicated form; confirm which operand list the integer ZPZZZ pseudo
// uses.
def : InstRW<[N3Wr_ZMABHS, ReadDefault, N3Rd_ZMABHS],
             (instregex "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_[BHS]")>;
// Indexed (ZZZI) forms take no predicate, so the forwarding read must be the
// first read; with a leading ReadDefault it would apply to the following
// operand instead.
def : InstRW<[N3Wr_ZMABHS, N3Rd_ZMABHS], (instregex "^ML[AS]_ZZZI_[BHS]$")>;

// Multiply accumulate, D element size
def : InstRW<[N3Write_5c_2V0], (instregex "^ML[AS]_ZZZI_D$",
// Predicated forms: ReadDefault covers the operand that precedes the one
// receiving the forwarding read. This mirrors the FCMLA and FML[AS] entries
// in this file, which apply ReadDefault only to predicated forms.
// NOTE(review): confirm which operand list the ZPZZZ pseudo uses.
def : InstRW<[N3Wr_ZMAD, ReadDefault, N3Rd_ZMAD],
             (instregex "^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>;
// Indexed (ZZZI) form takes no predicate, so the forwarding read must be the
// first read.
def : InstRW<[N3Wr_ZMAD, N3Rd_ZMAD], (instregex "^ML[AS]_ZZZI_D$")>;

// Multiply accumulate long
def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$",
def : InstRW<[N3Wr_ZMAL, N3Rd_ZMAL], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$",
"^[SU]ML[AS]L[BT]_ZZZI_[SD]$")>;

// Multiply accumulate saturating doubling long regular
def : InstRW<[N3Write_4c_1V0], (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$",
"^SQDML[AS](LB|LT)_ZZZI_[SD]$")>;
def : InstRW<[N3Wr_ZMASQL, N3Rd_ZMASQ],
(instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$",
"^SQDML[AS](LB|LT)_ZZZI_[SD]$")>;

// Multiply saturating doubling high, B, H, S element size
def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMULH_ZZZ_[BHS]$",
Expand All @@ -1854,13 +1963,13 @@ def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMULL[BT]_ZZZ_[HSD]$",
"^SQDMULL[BT]_ZZZI_[SD]$")>;

// Multiply saturating rounding doubling regular/complex accumulate, B, H, S element size
def : InstRW<[N3Write_4c_1V0], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$",
def : InstRW<[N3Wr_ZMASQBHS, N3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$",
"^SQRDCMLAH_ZZZ_[BHS]$",
"^SQRDML[AS]H_ZZZI_[HS]$",
"^SQRDCMLAH_ZZZI_[HS]$")>;

// Multiply saturating rounding doubling regular/complex accumulate, D element size
def : InstRW<[N3Write_5c_2V0], (instregex "^SQRDML[AS]H_ZZZI?_D$",
def : InstRW<[N3Wr_ZMASQD, N3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZI?_D$",
"^SQRDCMLAH_ZZZ_D$")>;

// Multiply saturating rounding doubling regular/complex, B, H, S element size
Expand Down Expand Up @@ -1948,8 +2057,9 @@ def : InstRW<[N3Write_2c_1V], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$",
def : InstRW<[N3Write_3c_1V], (instregex "^FCADD_ZPmZ_[HSD]$")>;

// Floating point complex multiply add
def : InstRW<[N3Write_4c_1V], (instregex "^FCMLA_ZPmZZ_[HSD]$",
"^FCMLA_ZZZI_[HS]$")>;
def : InstRW<[N3Wr_ZFCMA, ReadDefault, N3Rd_ZFCMA],
(instregex "^FCMLA_ZPmZZ_[HSD]")>;
def : InstRW<[N3Wr_ZFCMA, N3Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>;

// Floating point convert, long or narrow (F16 to F32 or F32 to F16)
def : InstRW<[N3Write_4c_2V0], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
Expand Down Expand Up @@ -2014,12 +2124,15 @@ def : InstRW<[N3Write_3c_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
"^FMUL_ZPZ[IZ]_[HSD]")>;

// Floating point multiply accumulate
def : InstRW<[N3Write_4c_1V], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$",
"^FN?ML[AS]_ZPZZZ_[HSD]",
"^FML[AS]_ZZZI_[HSD]$")>;
def : InstRW<[N3Wr_ZFMA, ReadDefault, N3Rd_ZFMA],
(instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
"^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
def : InstRW<[N3Wr_ZFMA, N3Rd_ZFMA],
(instregex "^FML[AS]_ZZZI_[HSD]",
"^FN?ML[AS]_ZPZZZ_[HSD]")>;

// Floating point multiply add/sub accumulate long
def : InstRW<[N3Write_4c_1V], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>;
def : InstRW<[N3Wr_ZFMAL, N3Rd_ZFMAL], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>;

// Floating point reciprocal estimate, F16
def : InstRW<[N3Write_6c_4V0], (instregex "^FR(ECP|SQRT)E_ZZ_H", "^FRECPX_ZPmZ_H")>;
Expand Down Expand Up @@ -2079,13 +2192,13 @@ def : InstRW<[N3Write_3c_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]$")>;
def : InstRW<[N3Write_4c_2V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;

// Dot product
def : InstRW<[N3Write_4c_1V], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
def : InstRW<[N3Wr_ZBFDOT, N3Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;

// Matrix multiply accumulate
def : InstRW<[N3Write_5c_1V], (instrs BFMMLA_ZZZ_HtoS)>;
def : InstRW<[N3Wr_ZBFMMA, N3Rd_ZBFMMA], (instrs BFMMLA_ZZZ_HtoS)>;

// Multiply accumulate long
def : InstRW<[N3Write_4c_1V], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
def : InstRW<[N3Wr_ZBFMAL, N3Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;

// SVE Load instructions
// -----------------------------------------------------------------------------
Expand Down
Loading
Loading