-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[AArch64] Fix metrics of ASIMD instructions in Neoverse N3 #169790
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-aarch64 Author: Asher Dobrescu (Asher8118) Changes: Some ASIMD instructions in the Neoverse N3 scheduler model seem to have been missed and have default definitions, which give them incorrect latency and throughput. This patch fixes such instructions to match the current N3 SWOG. Patch is 85.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169790.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
index c73f60a1a7741..beeadd4403605 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td
@@ -1073,7 +1073,7 @@ def : SchedAlias<WriteVq, N3Write_2c_1V>;
// ASIMD shift accumulate
def : InstRW<[N3Wr_ADA, N3Rd_ADA], (instregex "^[SU]ABAL?v",
"^[SU]ADALPv",
- "^[SU]R?SRAv")>;
+ "^[SU]R?SRA(v|d)")>;
// ASIMD arith, reduce, 4H/4S
def : InstRW<[N3Write_3c_1V1], (instregex "^[SU]?ADDL?Vv4i(16|32)v$")>;
@@ -1114,30 +1114,30 @@ def : InstRW<[N3Wr_VMAH, N3Rd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
def : InstRW<[N3Wr_VMAL, N3Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
// ASIMD multiply accumulate saturating long
-def : InstRW<[N3Wr_VMASL, N3Rd_VMASL], (instregex "^SQDMLALv", "^SQDMLSLv")>;
+def : InstRW<[N3Wr_VMASL, N3Rd_VMASL], (instregex "^SQDMLAL(v|i16|i32)", "^SQDMLSL(v|i16|i32)")>;
// ASIMD multiply/multiply long (8x8) polynomial, D-form
// ASIMD multiply/multiply long (8x8) polynomial, Q-form
def : InstRW<[N3Write_2c_1V0], (instregex "^PMULL?(v8i8|v16i8)$")>;
// ASIMD multiply long
-def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]MULLv", "^SQDMULLv")>;
+def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]MULLv", "^SQDMULL(v|i16|i32)")>;
// ASIMD shift by immed, basic
-def : InstRW<[N3Write_2c_1V1], (instregex "^SHLv", "^SHLLv", "^SHRNv",
- "^SSHLLv", "^SSHRv", "^USHLLv",
- "^USHRv")>;
+def : InstRW<[N3Write_2c_1V1], (instregex "^SHL(v|d)", "^SHLLv", "^SHRNv",
+ "^SSHLLv", "^SSHR(v|d)", "^USHLLv",
+ "^USHR(v|d)")>;
// ASIMD shift by immed and insert, basic
-def : InstRW<[N3Write_2c_1V1], (instregex "^SLIv", "^SRIv")>;
+def : InstRW<[N3Write_2c_1V1], (instregex "^SLI(v|d)", "^SRI(v|d)")>;
// ASIMD shift by immed, complex
def : InstRW<[N3Write_4c_1V1],
- (instregex "^RSHRNv", "^SQRSHRNv", "^SQRSHRUNv",
+ (instregex "^RSHRNv", "^SQRSHRN(v|b|h|s)", "^SQRSHRUN(v|b|h|s)",
"^(SQSHLU?|UQSHL)[bhsd]$",
"^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$",
- "^SQSHRNv", "^SQSHRUNv", "^SRSHRv", "^UQRSHRNv",
- "^UQSHRNv", "^URSHRv")>;
+ "^SQSHRN(v|b|h|s)", "^SQSHRUN(v|b|h|s)", "^SRSHR(v|d)",
+ "^UQRSHRN(v|b|h|s)", "^UQSHRN(v|b|h|s)","^URSHR(v|d)")>;
// ASIMD shift by register, basic
def : InstRW<[N3Write_2c_1V1], (instregex "^[SU]SHLv")>;
@@ -1173,16 +1173,16 @@ def : InstRW<[N3Write_3c_1V0], (instregex "^FCVTL(v2|v4)i32")>;
def : InstRW<[N3Write_4c_2V0], (instregex "^FCVTN(v4|v8)i16")>;
// ASIMD FP convert, narrow (F64 to F32)
-def : InstRW<[N3Write_3c_1V0], (instregex "^FCVTN(v2|v4)i32",
+def : InstRW<[N3Write_3c_1V0], (instregex "^FCVTN(v2|v4)i32", "^FCVTXNv1i64",
"^FCVTXN(v2|v4)f32")>;
// ASIMD FP convert, other, D-form F32 and Q-form F64
-def : InstRW<[N3Write_3c_1V0], (instregex "^[FSU]CVT[AMNPZ][SU]v2f(32|64)$",
- "^[SU]CVTFv2f(32|64)$")>;
+def : InstRW<[N3Write_3c_1V0], (instregex "^[FSU]CVT[AMNPZ][SU](v2f(32|64)|s|d|v1i32|v1i64|v2i32_shift|v2i64_shift)$",
+ "^[SU]CVTF(v2f(32|64)|s|d|v1i32|v1i64|v2i32_shift|v2i64_shift)$")>;
// ASIMD FP convert, other, D-form F16 and Q-form F32
-def : InstRW<[N3Write_4c_2V0], (instregex "^[FSU]CVT[AMNPZ][SU]v4f(16|32)$",
- "^[SU]CVTFv4f(16|32)$")>;
+def : InstRW<[N3Write_4c_2V0], (instregex "^[FSU]CVT[AMNPZ][SU](v4f(16|32)|v4i(16|32)_shift)$",
+ "^[SU]CVTF(v4f(16|32)|v4i(16|32)_shift)$")>;
// ASIMD FP convert, other, Q-form F16
def : InstRW<[N3Write_6c_4V0], (instregex "^[FSU]CVT[AMNPZ][SU]v8f16$",
@@ -1217,7 +1217,7 @@ def : InstRW<[N3Write_4c_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>;
def : InstRW<[N3Write_6c_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>;
// ASIMD FP multiply
-def : InstRW<[N3Wr_FPM], (instregex "^FMULv", "^FMULXv")>;
+def : InstRW<[N3Wr_FPM], (instregex "^FMULv", "^FMULX(v|32|64)")>;
// ASIMD FP multiply accumulate
def : InstRW<[N3Wr_FPMA, N3Rd_FPMA], (instregex "^FMLAv", "^FMLSv")>;
@@ -1305,9 +1305,9 @@ def : InstRW<[N3Write_4c_2V0], (instrs URECPEv4i32, URSQRTEv4i32)>;
// ASIMD reciprocal and square root estimate, D-form F32 and scalar forms
def : InstRW<[N3Write_3c_1V0], (instrs FRECPEv1f16, FRECPEv1i32,
- FRECPEv1i64, FRECPEv2f32,
+ FRECPEv1i64, FRECPEv2f32, FRECPEv2f64,
FRSQRTEv1f16, FRSQRTEv1i32,
- FRSQRTEv1i64, FRSQRTEv2f32)>;
+ FRSQRTEv1i64, FRSQRTEv2f32, FRSQRTEv2f64)>;
// ASIMD reciprocal and square root estimate, D-form F16 and Q-form F32
def : InstRW<[N3Write_4c_2V0], (instrs FRECPEv4f16, FRECPEv4f32,
@@ -1320,7 +1320,7 @@ def : InstRW<[N3Write_6c_4V0], (instrs FRECPEv8f16, FRSQRTEv8f16)>;
def : InstRW<[N3Write_3c_1V0], (instregex "^FRECPXv")>;
// ASIMD reciprocal step
-def : InstRW<[N3Write_4c_1V], (instregex "^FRECPSv", "^FRSQRTSv")>;
+def : InstRW<[N3Write_4c_1V], (instregex "^FRECPS(v|32|64)", "^FRSQRTS(v|32|64)")>;
// ASIMD table lookup, 3 table regs
def : InstRW<[N3Write_4c_2V], (instrs TBLv8i8Three, TBLv16i8Three)>;
diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-neon-instructions.s
index dddaca34f68dd..da8c0c5154cdc 100644
--- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-neon-instructions.s
+++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-neon-instructions.s
@@ -1189,15 +1189,15 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 2 0.50 fcmlt d20, d21, #0.0
# CHECK-NEXT: 1 2 0.50 fcmlt s10, s11, #0.0
# CHECK-NEXT: 1 2 0.50 fcmlt v0.4s, v0.4s, #0.0
-# CHECK-NEXT: 1 2 0.50 fcvtas d21, d14
-# CHECK-NEXT: 1 2 0.50 fcvtas s12, s13
+# CHECK-NEXT: 1 3 1.00 fcvtas d21, d14
+# CHECK-NEXT: 1 3 1.00 fcvtas s12, s13
# CHECK-NEXT: 1 3 1.00 fcvtas v0.2d, v0.2d
# CHECK-NEXT: 1 3 1.00 fcvtas v0.2s, v0.2s
# CHECK-NEXT: 2 4 2.00 fcvtas v0.4h, v0.4h
# CHECK-NEXT: 2 4 2.00 fcvtas v0.4s, v0.4s
# CHECK-NEXT: 4 6 4.00 fcvtas v0.8h, v0.8h
-# CHECK-NEXT: 1 2 0.50 fcvtau d21, d14
-# CHECK-NEXT: 1 2 0.50 fcvtau s12, s13
+# CHECK-NEXT: 1 3 1.00 fcvtau d21, d14
+# CHECK-NEXT: 1 3 1.00 fcvtau s12, s13
# CHECK-NEXT: 1 3 1.00 fcvtau v0.2d, v0.2d
# CHECK-NEXT: 1 3 1.00 fcvtau v0.2s, v0.2s
# CHECK-NEXT: 2 4 2.00 fcvtau v0.4h, v0.4h
@@ -1207,15 +1207,15 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 2 4 2.00 fcvtl v0.4s, v0.4h
# CHECK-NEXT: 1 3 1.00 fcvtl2 v0.2d, v0.4s
# CHECK-NEXT: 2 4 2.00 fcvtl2 v0.4s, v0.8h
-# CHECK-NEXT: 1 2 0.50 fcvtms d21, d14
-# CHECK-NEXT: 1 2 0.50 fcvtms s22, s13
+# CHECK-NEXT: 1 3 1.00 fcvtms d21, d14
+# CHECK-NEXT: 1 3 1.00 fcvtms s22, s13
# CHECK-NEXT: 1 3 1.00 fcvtms v0.2d, v0.2d
# CHECK-NEXT: 1 3 1.00 fcvtms v0.2s, v0.2s
# CHECK-NEXT: 2 4 2.00 fcvtms v0.4h, v0.4h
# CHECK-NEXT: 2 4 2.00 fcvtms v0.4s, v0.4s
# CHECK-NEXT: 4 6 4.00 fcvtms v0.8h, v0.8h
-# CHECK-NEXT: 1 2 0.50 fcvtmu d21, d14
-# CHECK-NEXT: 1 2 0.50 fcvtmu s12, s13
+# CHECK-NEXT: 1 3 1.00 fcvtmu d21, d14
+# CHECK-NEXT: 1 3 1.00 fcvtmu s12, s13
# CHECK-NEXT: 1 3 1.00 fcvtmu v0.2d, v0.2d
# CHECK-NEXT: 1 3 1.00 fcvtmu v0.2s, v0.2s
# CHECK-NEXT: 2 4 2.00 fcvtmu v0.4h, v0.4h
@@ -1225,60 +1225,60 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 2 4 2.00 fcvtn v0.4h, v0.4s
# CHECK-NEXT: 1 3 1.00 fcvtn2 v0.4s, v0.2d
# CHECK-NEXT: 2 4 2.00 fcvtn2 v0.8h, v0.4s
-# CHECK-NEXT: 1 2 0.50 fcvtns d21, d14
-# CHECK-NEXT: 1 2 0.50 fcvtns s22, s13
+# CHECK-NEXT: 1 3 1.00 fcvtns d21, d14
+# CHECK-NEXT: 1 3 1.00 fcvtns s22, s13
# CHECK-NEXT: 1 3 1.00 fcvtns v0.2d, v0.2d
# CHECK-NEXT: 1 3 1.00 fcvtns v0.2s, v0.2s
# CHECK-NEXT: 2 4 2.00 fcvtns v0.4h, v0.4h
# CHECK-NEXT: 2 4 2.00 fcvtns v0.4s, v0.4s
# CHECK-NEXT: 4 6 4.00 fcvtns v0.8h, v0.8h
-# CHECK-NEXT: 1 2 0.50 fcvtnu d21, d14
-# CHECK-NEXT: 1 2 0.50 fcvtnu s12, s13
+# CHECK-NEXT: 1 3 1.00 fcvtnu d21, d14
+# CHECK-NEXT: 1 3 1.00 fcvtnu s12, s13
# CHECK-NEXT: 1 3 1.00 fcvtnu v0.2d, v0.2d
# CHECK-NEXT: 1 3 1.00 fcvtnu v0.2s, v0.2s
# CHECK-NEXT: 2 4 2.00 fcvtnu v0.4h, v0.4h
# CHECK-NEXT: 2 4 2.00 fcvtnu v0.4s, v0.4s
# CHECK-NEXT: 4 6 4.00 fcvtnu v0.8h, v0.8h
-# CHECK-NEXT: 1 2 0.50 fcvtps d21, d14
-# CHECK-NEXT: 1 2 0.50 fcvtps s22, s13
+# CHECK-NEXT: 1 3 1.00 fcvtps d21, d14
+# CHECK-NEXT: 1 3 1.00 fcvtps s22, s13
# CHECK-NEXT: 1 3 1.00 fcvtps v0.2d, v0.2d
# CHECK-NEXT: 1 3 1.00 fcvtps v0.2s, v0.2s
# CHECK-NEXT: 2 4 2.00 fcvtps v0.4h, v0.4h
# CHECK-NEXT: 2 4 2.00 fcvtps v0.4s, v0.4s
# CHECK-NEXT: 4 6 4.00 fcvtps v0.8h, v0.8h
-# CHECK-NEXT: 1 2 0.50 fcvtpu d21, d14
-# CHECK-NEXT: 1 2 0.50 fcvtpu s12, s13
+# CHECK-NEXT: 1 3 1.00 fcvtpu d21, d14
+# CHECK-NEXT: 1 3 1.00 fcvtpu s12, s13
# CHECK-NEXT: 1 3 1.00 fcvtpu v0.2d, v0.2d
# CHECK-NEXT: 1 3 1.00 fcvtpu v0.2s, v0.2s
# CHECK-NEXT: 2 4 2.00 fcvtpu v0.4h, v0.4h
# CHECK-NEXT: 2 4 2.00 fcvtpu v0.4s, v0.4s
# CHECK-NEXT: 4 6 4.00 fcvtpu v0.8h, v0.8h
-# CHECK-NEXT: 1 2 0.50 fcvtxn s22, d13
+# CHECK-NEXT: 1 3 1.00 fcvtxn s22, d13
# CHECK-NEXT: 1 3 1.00 fcvtxn v0.2s, v0.2d
# CHECK-NEXT: 1 3 1.00 fcvtxn2 v0.4s, v0.2d
-# CHECK-NEXT: 1 2 0.50 fcvtzs d21, d12, #1
-# CHECK-NEXT: 1 2 0.50 fcvtzs d21, d14
-# CHECK-NEXT: 1 2 0.50 fcvtzs s12, s13
-# CHECK-NEXT: 1 2 0.50 fcvtzs s21, s12, #1
+# CHECK-NEXT: 1 3 1.00 fcvtzs d21, d12, #1
+# CHECK-NEXT: 1 3 1.00 fcvtzs d21, d14
+# CHECK-NEXT: 1 3 1.00 fcvtzs s12, s13
+# CHECK-NEXT: 1 3 1.00 fcvtzs s21, s12, #1
# CHECK-NEXT: 1 3 1.00 fcvtzs v0.2d, v0.2d
-# CHECK-NEXT: 1 2 0.50 fcvtzs v0.2d, v0.2d, #3
+# CHECK-NEXT: 1 3 1.00 fcvtzs v0.2d, v0.2d, #3
# CHECK-NEXT: 1 3 1.00 fcvtzs v0.2s, v0.2s
-# CHECK-NEXT: 1 2 0.50 fcvtzs v0.2s, v0.2s, #3
+# CHECK-NEXT: 1 3 1.00 fcvtzs v0.2s, v0.2s, #3
# CHECK-NEXT: 2 4 2.00 fcvtzs v0.4h, v0.4h
# CHECK-NEXT: 2 4 2.00 fcvtzs v0.4s, v0.4s
-# CHECK-NEXT: 1 2 0.50 fcvtzs v0.4s, v0.4s, #3
+# CHECK-NEXT: 2 4 2.00 fcvtzs v0.4s, v0.4s, #3
# CHECK-NEXT: 4 6 4.00 fcvtzs v0.8h, v0.8h
-# CHECK-NEXT: 1 2 0.50 fcvtzu d21, d12, #1
-# CHECK-NEXT: 1 2 0.50 fcvtzu d21, d14
-# CHECK-NEXT: 1 2 0.50 fcvtzu s12, s13
-# CHECK-NEXT: 1 2 0.50 fcvtzu s21, s12, #1
+# CHECK-NEXT: 1 3 1.00 fcvtzu d21, d12, #1
+# CHECK-NEXT: 1 3 1.00 fcvtzu d21, d14
+# CHECK-NEXT: 1 3 1.00 fcvtzu s12, s13
+# CHECK-NEXT: 1 3 1.00 fcvtzu s21, s12, #1
# CHECK-NEXT: 1 3 1.00 fcvtzu v0.2d, v0.2d
-# CHECK-NEXT: 1 2 0.50 fcvtzu v0.2d, v0.2d, #3
+# CHECK-NEXT: 1 3 1.00 fcvtzu v0.2d, v0.2d, #3
# CHECK-NEXT: 1 3 1.00 fcvtzu v0.2s, v0.2s
-# CHECK-NEXT: 1 2 0.50 fcvtzu v0.2s, v0.2s, #3
+# CHECK-NEXT: 1 3 1.00 fcvtzu v0.2s, v0.2s, #3
# CHECK-NEXT: 2 4 2.00 fcvtzu v0.4h, v0.4h
# CHECK-NEXT: 2 4 2.00 fcvtzu v0.4s, v0.4s
-# CHECK-NEXT: 1 2 0.50 fcvtzu v0.4s, v0.4s, #3
+# CHECK-NEXT: 2 4 2.00 fcvtzu v0.4s, v0.4s, #3
# CHECK-NEXT: 4 6 4.00 fcvtzu v0.8h, v0.8h
# CHECK-NEXT: 2 8 2.00 fdiv v0.2s, v0.2s, v0.2s
# CHECK-NEXT: 1 2 0.50 fmax v0.2d, v0.2d, v0.2d
@@ -1318,8 +1318,8 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 3 0.50 fmul s0, s1, v0.s[3]
# CHECK-NEXT: 1 3 0.50 fmul v0.2s, v0.2s, v0.2s
# CHECK-NEXT: 1 3 0.50 fmulx d0, d4, v0.d[1]
-# CHECK-NEXT: 1 2 0.50 fmulx d23, d11, d1
-# CHECK-NEXT: 1 2 0.50 fmulx s20, s22, s15
+# CHECK-NEXT: 1 3 0.50 fmulx d23, d11, d1
+# CHECK-NEXT: 1 3 0.50 fmulx s20, s22, s15
# CHECK-NEXT: 1 3 0.50 fmulx s3, s5, v0.s[3]
# CHECK-NEXT: 1 3 0.50 fmulx v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 1 3 0.50 fmulx v0.2s, v0.2s, v0.2s
@@ -1331,14 +1331,14 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 2 0.50 fneg v0.8h, v0.8h
# CHECK-NEXT: 1 3 1.00 frecpe d13, d13
# CHECK-NEXT: 1 3 1.00 frecpe s19, s14
-# CHECK-NEXT: 1 2 0.50 frecpe v0.2d, v0.2d
+# CHECK-NEXT: 1 3 1.00 frecpe v0.2d, v0.2d
# CHECK-NEXT: 1 3 1.00 frecpe v0.2s, v0.2s
# CHECK-NEXT: 2 4 2.00 frecpe v0.4h, v0.4h
# CHECK-NEXT: 2 4 2.00 frecpe v0.4s, v0.4s
# CHECK-NEXT: 4 6 4.00 frecpe v0.8h, v0.8h
# CHECK-NEXT: 1 4 0.50 frecps v0.4s, v0.4s, v0.4s
-# CHECK-NEXT: 1 2 0.50 frecps d22, d30, d21
-# CHECK-NEXT: 1 2 0.50 frecps s21, s16, s13
+# CHECK-NEXT: 1 4 0.50 frecps d22, d30, d21
+# CHECK-NEXT: 1 4 0.50 frecps s21, s16, s13
# CHECK-NEXT: 1 3 1.00 frecpx d16, d19
# CHECK-NEXT: 1 3 1.00 frecpx s18, s10
# CHECK-NEXT: 1 3 1.00 frinta v0.2d, v0.2d
@@ -1378,13 +1378,13 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 4 6 4.00 frintz v0.8h, v0.8h
# CHECK-NEXT: 1 3 1.00 frsqrte d21, d12
# CHECK-NEXT: 1 3 1.00 frsqrte s22, s13
-# CHECK-NEXT: 1 2 0.50 frsqrte v0.2d, v0.2d
+# CHECK-NEXT: 1 3 1.00 frsqrte v0.2d, v0.2d
# CHECK-NEXT: 1 3 1.00 frsqrte v0.2s, v0.2s
# CHECK-NEXT: 2 4 2.00 frsqrte v0.4h, v0.4h
# CHECK-NEXT: 2 4 2.00 frsqrte v0.4s, v0.4s
# CHECK-NEXT: 4 6 4.00 frsqrte v0.8h, v0.8h
-# CHECK-NEXT: 1 2 0.50 frsqrts d8, d22, d18
-# CHECK-NEXT: 1 2 0.50 frsqrts s21, s5, s12
+# CHECK-NEXT: 1 4 0.50 frsqrts d8, d22, d18
+# CHECK-NEXT: 1 4 0.50 frsqrts s21, s5, s12
# CHECK-NEXT: 1 4 0.50 frsqrts v0.2d, v0.2d, v0.2d
# CHECK-NEXT: 2 13 2.00 fsqrt v0.2d, v0.2d
# CHECK-NEXT: 2 8 2.00 fsqrt v0.2s, v0.2s
@@ -1543,20 +1543,20 @@ zip2 v0.8h, v0.8h, v0.8h
# CHECK-NEXT: 1 2 0.50 saddw2 v0.2d, v0.2d, v0.4s
# CHECK-NEXT: 1 2 0.50 saddw2 v0.4s, v0.4s, v0.8h
# CHECK-NEXT: 1 2 0.50 saddw2 v0.8h, v0.8h, v0.16b
-# CHECK-NEXT: 1 2 0.50 scvtf d21, d12
-# CHECK-NEXT: 1 2 0.50 scvtf d21, d12, #64
-# CHECK-NEXT: 1 2 0.50 scvtf s22, s13
-# CHECK-NEXT: 1 2 0.50 scvtf s22, s13, #32
+# CHECK-NEXT: 1 3 1.00 scvtf d21, d12
+# CHECK-NEXT: 1 3 1.00 scvtf d21, d12, #64
+# CHECK-NEXT: 1 3 1.00 scvtf s22, s13
+# CHECK-NEXT: 1 3 1.00 scvtf s22, s13, #32
# CHECK-NEXT: 1 3 1.00 scvtf v0.2d, v0.2d
-# CHECK-NEXT: 1 2 0.50 scvtf v0.2d, v0.2d, #3
+# CHECK-NEXT: 1 3 1.00 scvtf v0.2d, v0.2d, #3
# CHECK-NEXT: 1 3 1.00 scvtf v0.2s, v0.2s
-# CHECK-NEXT: 1 2 0.50 scvtf v0.2s, v0.2s, #3
+# CHECK-NEXT: 1 3 1.00 scvtf v0.2s, v0.2s, #3
# CHECK-NEXT: 2 4 2.00 scvtf v0.4h, v0.4h
# CHECK-NEXT: 2 4 2.00 scvt...
[truncated]
|
| // ASIMD reciprocal and square root estimate, D-form F32 and scalar forms | ||
| def : InstRW<[N3Write_3c_1V0], (instrs FRECPEv1f16, FRECPEv1i32, | ||
| FRECPEv1i64, FRECPEv2f32, | ||
| FRECPEv1i64, FRECPEv2f32, FRECPEv2f64, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
While this is D-form, it is not F32, but F64. I could not find a category that fits this instruction, but this felt like the closest group it could belong to.
c-rhodes
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One minor comment, but otherwise LGTM, cheers.
🐧 Linux x64 Test Results
|
Some ASIMD instructions in the Neoverse N3 scheduler model seem to have been missed and have default definitions, which give them incorrect latency and throughput. This patch fixes such instructions to match the current N3 SWOG.
Some ASIMD instructions in the Neoverse N3 scheduler model seem to have been missed and have default definitions, which give them incorrect latency and throughput. This patch fixes such instructions to match the current N3 SWOG.
Some ASIMD instructions in the Neoverse N3 scheduler model seem to have been missed and have default definitions, which give them incorrect latency and throughput. This patch fixes such instructions to match the current N3 SWOG.