-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[X86] Fix throughput typo in XMM/YMM PACK/PALIGNR schedule classes #157867
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[X86] Fix throughput typo in XMM/YMM PACK/PALIGNR schedule classes #157867
Conversation
Only the ZMM PACK/PALIGNR instructions are half-rate on znver4 - confirmed with AMD SOG, uops.info and Agner. Noticed because comparing cost-table shuffle costs vs llvm-mca costs kept giving weird numbers if I tested it on znver4 vs x86-64-v4. It looks like there are other znver4 overrides that make this mistake, but many of these need cleaning up to use the (currently unused) default classes properly.
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) Changes: Only the ZMM PACK/PALIGNR instructions are half-rate on znver4 - confirmed with AMD SOG, uops.info and Agner. Noticed because comparing cost-table shuffle costs vs llvm-mca costs kept giving weird numbers if I tested it on znver4 vs any other avx2/avx512 target. It looks like there are other znver4 overrides that make this mistake, but many of these need cleaning up properly to use the (currently unused) default classes. Patch is 57.28 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/157867.diff 7 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index f4b8f8927b1b5..a93c7e3a82f17 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -1534,9 +1534,9 @@ def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
- "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
+ "VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
"VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
- "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
+ "VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
)>;
// SCALE & REDUCE instructions
@@ -1567,7 +1567,7 @@ def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> {
let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
- "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
+ "VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
"VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
)>;
@@ -1586,7 +1586,7 @@ def : InstRW<[Zn4WriteSHIFTrr], (instregex
"(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
"(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
"(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
- "VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int"
+ "VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int"
)>;
def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> {
@@ -1598,24 +1598,40 @@ def : InstRW<[Zn4WriteSHIFTri], (instregex
"VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
)>;
-// ALIGN Instructions
-def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> {
+// ALIGNR Instructions
+def Zn4WriteALIGNR: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 2;
+ let ReleaseAtCycles = [1];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WriteALIGNR], (instregex
+ "(V?)PALIGNR(Y?|Z128?|Z256?)(rri|rrik|rrikz)"
+ )>;
+def Zn4WriteALIGNRZ: SchedWriteRes<[Zn4FPFMisc12]> {
let Latency = 2;
let ReleaseAtCycles = [2];
let NumMicroOps = 1;
}
-def : InstRW<[Zn4WriteALIGN], (instregex
- "(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
+def : InstRW<[Zn4WriteALIGNRZ], (instregex
+ "(V?)PALIGNRZ(rri|rrik|rrikz)"
)>;
-//PACK Instructions
+// PACK Instructions
def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> {
let Latency = 2;
- let ReleaseAtCycles = [2];
+ let ReleaseAtCycles = [1];
let NumMicroOps = 1;
}
def : InstRW<[Zn4WritePACK], (instregex
- "(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
+ "(V?)PACK(SS|US)(DW|WB)(Y?|Z128?|Z256?)(rr|rrk|rrkz)"
+ )>;
+def Zn4WritePACKZ: SchedWriteRes<[Zn4FPFMisc12]> {
+ let Latency = 2;
+ let ReleaseAtCycles = [2];
+ let NumMicroOps = 1;
+}
+def : InstRW<[Zn4WritePACKZ], (instregex
+ "(V?)PACK(SS|US)(DW|WB)Z(rr|rrk|rrkz)"
)>;
// MAX and MIN Instructions
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s
index 9b721c933ab51..1ffe53366fdb0 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s
@@ -1365,13 +1365,13 @@ vzeroupper
# CHECK-NEXT: 1 8 0.50 * vpabsd (%rax), %xmm2
# CHECK-NEXT: 1 2 1.00 vpabsw %xmm0, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpabsw (%rax), %xmm2
-# CHECK-NEXT: 1 2 1.00 vpackssdw %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 2 0.50 vpackssdw %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpackssdw (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 2 1.00 vpacksswb %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 2 0.50 vpacksswb %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpacksswb (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 2 1.00 vpackusdw %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 2 0.50 vpackusdw %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpackusdw (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 2 1.00 vpackuswb %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 2 0.50 vpackuswb %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpackuswb (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 1 0.25 vpaddb %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpaddb (%rax), %xmm1, %xmm2
@@ -1389,7 +1389,7 @@ vzeroupper
# CHECK-NEXT: 1 8 0.50 * vpaddusw (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 1 0.25 vpaddw %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpaddw (%rax), %xmm1, %xmm2
-# CHECK-NEXT: 1 2 1.00 vpalignr $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1 2 0.50 vpalignr $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpalignr $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 1 0.25 vpand %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpand (%rax), %xmm1, %xmm2
@@ -1749,7 +1749,7 @@ vzeroupper
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 205.25 396.08 270.58 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00
+# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 205.25 393.58 268.08 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -2088,13 +2088,13 @@ vzeroupper
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpabsd (%rax), %xmm2
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - - - - - - - - vpabsw %xmm0, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpabsw (%rax), %xmm2
-# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - vpackssdw %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - vpackssdw %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpackssdw (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - vpacksswb %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - vpacksswb %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpacksswb (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - vpackusdw %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - vpackusdw %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpackusdw (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - vpackuswb %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - vpackuswb %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpackuswb (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpaddb %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpaddb (%rax), %xmm1, %xmm2
@@ -2112,7 +2112,7 @@ vzeroupper
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpaddusw (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpaddw %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpaddw (%rax), %xmm1, %xmm2
-# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - vpalignr $1, %xmm0, %xmm1, %xmm2
+# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - vpalignr $1, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpalignr $1, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpand %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpand (%rax), %xmm1, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s
index 25e367c96e44b..6dc5bacde9059 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s
@@ -484,13 +484,13 @@ vpxor (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 8 0.50 * vpabsd (%rax), %ymm2
# CHECK-NEXT: 1 1 0.50 vpabsw %ymm0, %ymm2
# CHECK-NEXT: 1 8 0.50 * vpabsw (%rax), %ymm2
-# CHECK-NEXT: 1 1 0.50 vpackssdw %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1 2 0.50 vpackssdw %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1 8 0.50 * vpackssdw (%rax), %ymm1, %ymm2
-# CHECK-NEXT: 1 1 0.50 vpacksswb %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1 2 0.50 vpacksswb %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1 8 0.50 * vpacksswb (%rax), %ymm1, %ymm2
-# CHECK-NEXT: 1 1 0.50 vpackusdw %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1 2 0.50 vpackusdw %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1 8 0.50 * vpackusdw (%rax), %ymm1, %ymm2
-# CHECK-NEXT: 1 1 0.50 vpackuswb %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1 2 0.50 vpackuswb %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1 8 0.50 * vpackuswb (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 1 0.25 vpaddb %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1 8 0.50 * vpaddb (%rax), %ymm1, %ymm2
@@ -508,7 +508,7 @@ vpxor (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 8 0.50 * vpaddusw (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 1 0.25 vpaddw %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1 8 0.50 * vpaddw (%rax), %ymm1, %ymm2
-# CHECK-NEXT: 1 1 0.50 vpalignr $1, %ymm0, %ymm1, %ymm2
+# CHECK-NEXT: 1 2 0.50 vpalignr $1, %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1 8 0.50 * vpalignr $1, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 1 0.25 vpand %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1 8 0.50 * vpand (%rax), %ymm1, %ymm2
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512bwvl.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512bwvl.s
index a298dd69ee9b3..79f2cb4b7ab82 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512bwvl.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512bwvl.s
@@ -1166,53 +1166,53 @@ vpunpcklwd (%rax), %ymm17, %ymm19 {z}{k1}
# CHECK-NEXT: 1 8 0.50 * vpabsw (%rax), %ymm19 {%k1}
# CHECK-NEXT: 1 1 0.25 vpabsw %ymm16, %ymm19 {%k1} {z}
# CHECK-NEXT: 1 8 0.50 * vpabsw (%rax), %ymm19 {%k1} {z}
-# CHECK-NEXT: 1 2 1.00 vpackssdw %xmm16, %xmm17, %xmm19
+# CHECK-NEXT: 1 2 0.50 vpackssdw %xmm16, %xmm17, %xmm19
# CHECK-NEXT: 1 8 0.50 * vpackssdw (%rax), %xmm17, %xmm19
-# CHECK-NEXT: 1 2 1.00 vpackssdw %xmm16, %xmm17, %xmm19 {%k1}
+# CHECK-NEXT: 1 2 0.50 vpackssdw %xmm16, %xmm17, %xmm19 {%k1}
# CHECK-NEXT: 1 8 0.50 * vpackssdw (%rax), %xmm17, %xmm19 {%k1}
-# CHECK-NEXT: 1 2 1.00 vpackssdw %xmm16, %xmm17, %xmm19 {%k1} {z}
+# CHECK-NEXT: 1 2 0.50 vpackssdw %xmm16, %xmm17, %xmm19 {%k1} {z}
# CHECK-NEXT: 1 8 0.50 * vpackssdw (%rax), %xmm17, %xmm19 {%k1} {z}
-# CHECK-NEXT: 1 2 1.00 vpackssdw %ymm16, %ymm17, %ymm19
+# CHECK-NEXT: 1 2 0.50 vpackssdw %ymm16, %ymm17, %ymm19
# CHECK-NEXT: 1 8 0.50 * vpackssdw (%rax), %ymm17, %ymm19
-# CHECK-NEXT: 1 2 1.00 vpackssdw %ymm16, %ymm17, %ymm19 {%k1}
+# CHECK-NEXT: 1 2 0.50 vpackssdw %ymm16, %ymm17, %ymm19 {%k1}
# CHECK-NEXT: 1 8 0.50 * vpackssdw (%rax), %ymm17, %ymm19 {%k1}
-# CHECK-NEXT: 1 2 1.00 vpackssdw %ymm16, %ymm17, %ymm19 {%k1} {z}
+# CHECK-NEXT: 1 2 0.50 vpackssdw %ymm16, %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT: 1 8 0.50 * vpackssdw (%rax), %ymm17, %ymm19 {%k1} {z}
-# CHECK-NEXT: 1 2 1.00 vpacksswb %xmm16, %xmm17, %xmm19
+# CHECK-NEXT: 1 2 0.50 vpacksswb %xmm16, %xmm17, %xmm19
# CHECK-NEXT: 1 8 0.50 * vpacksswb (%rax), %xmm17, %xmm19
-# CHECK-NEXT: 1 2 1.00 vpacksswb %xmm16, %xmm17, %xmm19 {%k1}
+# CHECK-NEXT: 1 2 0.50 vpacksswb %xmm16, %xmm17, %xmm19 {%k1}
# CHECK-NEXT: 1 8 0.50 * vpacksswb (%rax), %xmm17, %xmm19 {%k1}
-# CHECK-NEXT: 1 2 1.00 vpacksswb %xmm16, %xmm17, %xmm19 {%k1} {z}
+# CHECK-NEXT: 1 2 0.50 vpacksswb %xmm16, %xmm17, %xmm19 {%k1} {z}
# CHECK-NEXT: 1 8 0.50 * vpacksswb (%rax), %xmm17, %xmm19 {%k1} {z}
-# CHECK-NEXT: 1 2 1.00 vpacksswb %ymm16, %ymm17, %ymm19
+# CHECK-NEXT: 1 2 0.50 vpacksswb %ymm16, %ymm17, %ymm19
# CHECK-NEXT: 1 8 0.50 * vpacksswb (%rax), %ymm17, %ymm19
-# CHECK-NEXT: 1 2 1.00 vpacksswb %ymm16, %ymm17, %ymm19 {%k1}
+# CHECK-NEXT: 1 2 0.50 vpacksswb %ymm16, %ymm17, %ymm19 {%k1}
# CHECK-NEXT: 1 8 0.50 * vpacksswb (%rax), %ymm17, %ymm19 {%k1}
-# CHECK-NEXT: 1 2 1.00 vpacksswb %ymm16, %ymm17, %ymm19 {%k1} {z}
+# CHECK-NEXT: 1 2 0.50 vpacksswb %ymm16, %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT: 1 8 0.50 * vpacksswb (%rax), %ymm17, %ymm19 {%k1} {z}
-# CHECK-NEXT: 1 2 1.00 vpackusdw %xmm16, %xmm17, %xmm19
+# CHECK-NEXT: 1 2 0.50 vpackusdw %xmm16, %xmm17, %xmm19
# CHECK-NEXT: 1 8 0.50 * vpackusdw (%rax), %xmm17, %xmm19
-# CHECK-NEXT: 1 2 1.00 vpackusdw %xmm16, %xmm17, %xmm19 {%k1}
+# CHECK-NEXT: 1 2 0.50 vpackusdw %xmm16, %xmm17, %xmm19 {%k1}
# CHECK-NEXT: 1 8 0.50 * vpackusdw (%rax), %xmm17, %xmm19 {%k1}
-# CHECK-NEXT: 1 2 1.00 vpackusdw %xmm16, %xmm17, %xmm19 {%k1} {z}
+# CHECK-NEXT: 1 2 0.50 vpackusdw %xmm16, %xmm17, %xmm19 {%k1} {z}
# CHECK-NEXT: 1 8 0.50 * vpackusdw (%rax), %xmm17, %xmm19 {%k1} {z}
-# CHECK-NEXT: 1 2 1.00 vpackusdw %ymm16, %ymm17, %ymm19
+# CHECK-NEXT: 1 2 0.50 vpackusdw %ymm16, %ymm17, %ymm19
# CHECK-NEXT: 1 8 0.50 * vpackusdw (%rax), %ymm17, %ymm19
-# CHECK-NEXT: 1 2 1.00 vpackusdw %ymm16, %ymm17, %ymm19 {%k1}
+# CHECK-NEXT: 1 2 0.50 vpackusdw %ymm16, %ymm17, %ymm19 {%k1}
# CHECK-NEXT: 1 8 0.50 * vpackusdw (%rax), %ymm17, %ymm19 {%k1}
-# CHECK-NEXT: 1 2 1.00 vpackusdw %ymm16, %ymm17, %ymm19 {%k1} {z}
+# CHECK-NEXT: 1 2 0.50 vpackusdw %ymm16, %ymm17, %ymm19 {%k1} {z}
# CHECK-NEXT: 1 8 0.50 * vpackusdw (%rax), %ymm17, %ymm19 {%k1} {z}
-# CHECK-NEXT: 1 2 1.00 vpackuswb %xmm16, %xmm17, %xmm19
+# CHECK-NEXT: 1 2 0.50 vpackuswb %xmm16, %xmm17, %xmm19
# CHECK-NEXT: 1 8 0.50 * vpackuswb (%rax), %xmm17, %xmm19
-# CHECK-NEXT: 1 2 1.00 vpackuswb %xmm16, %xmm17, %xmm19 {%k1}
+# CHECK-NEXT: 1 2 0.50 vpackuswb %xmm16, %xmm17, %xmm19 {%k1}
# CHECK-NEXT: 1 8 0.50 * vpackuswb (%rax), %xmm17, %xmm19 {%...
[truncated]
|
Only the ZMM PACK/PALIGNR instructions are half-rate on znver4 - confirmed with AMD SOG, uops.info and Agner.
Noticed because comparing cost-table shuffle costs vs llvm-mca costs kept giving weird numbers if I tested it on znver4 vs any other avx2/avx512 target.
It looks like there are other znver4 overrides that make this mistake, but many of these need cleaning up properly to use the (currently unused) default classes.