Skip to content

Commit fa0bb6a

Browse files
authored
[X86] Fix throughput typo in XMM/YMM PACK/PALIGNR schedule classes (#157867)
Only the ZMM PACK/PALIGNR instructions are half-rate on znver4 - confirmed with AMD SOG, uops.info and Agner. Noticed because comparing cost-table shuffle costs vs llvm-mca costs kept giving weird numbers when I tested on znver4 vs any other avx2/avx512 target. It looks like there are other znver4 overrides that make this mistake, but many of these need cleaning up properly to use the (currently unused) default classes.
1 parent 3751b6b commit fa0bb6a

File tree

7 files changed

+117
-101
lines changed

7 files changed

+117
-101
lines changed

llvm/lib/Target/X86/X86ScheduleZnver4.td

Lines changed: 27 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1534,9 +1534,9 @@ def Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr : SchedWriteRes<[Zn4FPFMisc01]> {
15341534
let NumMicroOps = 1;
15351535
}
15361536
def : InstRW<[Zn4WriteVFIXUPIMMPDZrr_VRANGESDrr], (instregex
1537-
"VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
1537+
"VFIXUPIMM(S|P)(S|D)(Z|Z128|Z256?)rrik", "VFIXUPIMM(S|P)(S|D)(Z?|Z128?|Z256?)rrikz",
15381538
"VFIXUPIMM(S|P)(S|D)(Z128|Z256?)rri", "VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)",
1539-
"VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
1539+
"VRANGE(S|P)(S|D)(Z|Z128|Z256?)rri(b?)k","VRANGE(S|P)(S|D)(Z?|Z128?|Z256?)rri(b?)kz"
15401540
)>;
15411541

15421542
// SCALE & REDUCE instructions
@@ -1567,7 +1567,7 @@ def Zn4WriteBUSDr_VPMADDr: SchedWriteRes<[Zn4FPFMisc01]> {
15671567
let NumMicroOps = 1;
15681568
}
15691569
def : InstRW<[Zn4WriteBUSDr_VPMADDr], (instregex
1570-
"VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
1570+
"VPDP(BU|WS)(S|P)(S|D|DS)(Z|Z128|Z256)(r|rk|rkz)",
15711571
"VPMADD52(H|L)UQ(Z|Z128|Z256)(r|rk|rkz)"
15721572
)>;
15731573

@@ -1586,7 +1586,7 @@ def : InstRW<[Zn4WriteSHIFTrr], (instregex
15861586
"(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z?|Z128?|Z256?)(rr|rrk|rrkz)",
15871587
"(V?)P(ROL|ROR)(D|Q|VD|VQ)(Z256?)(ri|rik|rikz)",
15881588
"(V?)P(ROL|ROR)(D|Q)(Z?|Z128?)(ri|rik|rikz)",
1589-
"VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int"
1589+
"VPSHUFBITQMBZ128rr", "VFMSUB231SSZrkz_Int"
15901590
)>;
15911591

15921592
def Zn4WriteSHIFTri: SchedWriteRes<[Zn4FPFMisc01]> {
@@ -1598,24 +1598,40 @@ def : InstRW<[Zn4WriteSHIFTri], (instregex
15981598
"VP(SLL|SRL|SRA)(D|Q|W)(Z|Z128|Z256?)(ri|rik|rikz)"
15991599
)>;
16001600

1601-
// ALIGN Instructions
1602-
def Zn4WriteALIGN: SchedWriteRes<[Zn4FPFMisc12]> {
1601+
// ALIGNR Instructions
1602+
def Zn4WriteALIGNR: SchedWriteRes<[Zn4FPFMisc12]> {
1603+
let Latency = 2;
1604+
let ReleaseAtCycles = [1];
1605+
let NumMicroOps = 1;
1606+
}
1607+
def : InstRW<[Zn4WriteALIGNR], (instregex
1608+
"(V?)PALIGNR(Y?|Z128?|Z256?)(rri|rrik|rrikz)"
1609+
)>;
1610+
def Zn4WriteALIGNRZ: SchedWriteRes<[Zn4FPFMisc12]> {
16031611
let Latency = 2;
16041612
let ReleaseAtCycles = [2];
16051613
let NumMicroOps = 1;
16061614
}
1607-
def : InstRW<[Zn4WriteALIGN], (instregex
1608-
"(V?)PALIGNR(Z?|Z128?|Z256?)(rri|rrik|rrikz)"
1615+
def : InstRW<[Zn4WriteALIGNRZ], (instregex
1616+
"(V?)PALIGNRZ(rri|rrik|rrikz)"
16091617
)>;
16101618

1611-
//PACK Instructions
1619+
// PACK Instructions
16121620
def Zn4WritePACK: SchedWriteRes<[Zn4FPFMisc12]> {
16131621
let Latency = 2;
1614-
let ReleaseAtCycles = [2];
1622+
let ReleaseAtCycles = [1];
16151623
let NumMicroOps = 1;
16161624
}
16171625
def : InstRW<[Zn4WritePACK], (instregex
1618-
"(V?)PACK(SS|US)(DW|WB)(Z?|Z128?|Z256?)(rr|rrk|rrkz)"
1626+
"(V?)PACK(SS|US)(DW|WB)(Y?|Z128?|Z256?)(rr|rrk|rrkz)"
1627+
)>;
1628+
def Zn4WritePACKZ: SchedWriteRes<[Zn4FPFMisc12]> {
1629+
let Latency = 2;
1630+
let ReleaseAtCycles = [2];
1631+
let NumMicroOps = 1;
1632+
}
1633+
def : InstRW<[Zn4WritePACKZ], (instregex
1634+
"(V?)PACK(SS|US)(DW|WB)Z(rr|rrk|rrkz)"
16191635
)>;
16201636

16211637
// MAX and MIN Instructions

llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1365,13 +1365,13 @@ vzeroupper
13651365
# CHECK-NEXT: 1 8 0.50 * vpabsd (%rax), %xmm2
13661366
# CHECK-NEXT: 1 2 1.00 vpabsw %xmm0, %xmm2
13671367
# CHECK-NEXT: 1 8 0.50 * vpabsw (%rax), %xmm2
1368-
# CHECK-NEXT: 1 2 1.00 vpackssdw %xmm0, %xmm1, %xmm2
1368+
# CHECK-NEXT: 1 2 0.50 vpackssdw %xmm0, %xmm1, %xmm2
13691369
# CHECK-NEXT: 1 8 0.50 * vpackssdw (%rax), %xmm1, %xmm2
1370-
# CHECK-NEXT: 1 2 1.00 vpacksswb %xmm0, %xmm1, %xmm2
1370+
# CHECK-NEXT: 1 2 0.50 vpacksswb %xmm0, %xmm1, %xmm2
13711371
# CHECK-NEXT: 1 8 0.50 * vpacksswb (%rax), %xmm1, %xmm2
1372-
# CHECK-NEXT: 1 2 1.00 vpackusdw %xmm0, %xmm1, %xmm2
1372+
# CHECK-NEXT: 1 2 0.50 vpackusdw %xmm0, %xmm1, %xmm2
13731373
# CHECK-NEXT: 1 8 0.50 * vpackusdw (%rax), %xmm1, %xmm2
1374-
# CHECK-NEXT: 1 2 1.00 vpackuswb %xmm0, %xmm1, %xmm2
1374+
# CHECK-NEXT: 1 2 0.50 vpackuswb %xmm0, %xmm1, %xmm2
13751375
# CHECK-NEXT: 1 8 0.50 * vpackuswb (%rax), %xmm1, %xmm2
13761376
# CHECK-NEXT: 1 1 0.25 vpaddb %xmm0, %xmm1, %xmm2
13771377
# CHECK-NEXT: 1 8 0.50 * vpaddb (%rax), %xmm1, %xmm2
@@ -1389,7 +1389,7 @@ vzeroupper
13891389
# CHECK-NEXT: 1 8 0.50 * vpaddusw (%rax), %xmm1, %xmm2
13901390
# CHECK-NEXT: 1 1 0.25 vpaddw %xmm0, %xmm1, %xmm2
13911391
# CHECK-NEXT: 1 8 0.50 * vpaddw (%rax), %xmm1, %xmm2
1392-
# CHECK-NEXT: 1 2 1.00 vpalignr $1, %xmm0, %xmm1, %xmm2
1392+
# CHECK-NEXT: 1 2 0.50 vpalignr $1, %xmm0, %xmm1, %xmm2
13931393
# CHECK-NEXT: 1 8 0.50 * vpalignr $1, (%rax), %xmm1, %xmm2
13941394
# CHECK-NEXT: 1 1 0.25 vpand %xmm0, %xmm1, %xmm2
13951395
# CHECK-NEXT: 1 8 0.50 * vpand (%rax), %xmm1, %xmm2
@@ -1749,7 +1749,7 @@ vzeroupper
17491749

17501750
# CHECK: Resource pressure per iteration:
17511751
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
1752-
# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 205.25 396.08 270.58 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00
1752+
# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 205.25 393.58 268.08 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00
17531753

17541754
# CHECK: Resource pressure by instruction:
17551755
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -2088,13 +2088,13 @@ vzeroupper
20882088
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpabsd (%rax), %xmm2
20892089
# CHECK-NEXT: - - - - - - - - 1.00 1.00 - - - - - - - - - - - - - vpabsw %xmm0, %xmm2
20902090
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpabsw (%rax), %xmm2
2091-
# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - vpackssdw %xmm0, %xmm1, %xmm2
2091+
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - vpackssdw %xmm0, %xmm1, %xmm2
20922092
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpackssdw (%rax), %xmm1, %xmm2
2093-
# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - vpacksswb %xmm0, %xmm1, %xmm2
2093+
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - vpacksswb %xmm0, %xmm1, %xmm2
20942094
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpacksswb (%rax), %xmm1, %xmm2
2095-
# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - vpackusdw %xmm0, %xmm1, %xmm2
2095+
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - vpackusdw %xmm0, %xmm1, %xmm2
20962096
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpackusdw (%rax), %xmm1, %xmm2
2097-
# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - vpackuswb %xmm0, %xmm1, %xmm2
2097+
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - vpackuswb %xmm0, %xmm1, %xmm2
20982098
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpackuswb (%rax), %xmm1, %xmm2
20992099
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpaddb %xmm0, %xmm1, %xmm2
21002100
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpaddb (%rax), %xmm1, %xmm2
@@ -2112,7 +2112,7 @@ vzeroupper
21122112
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpaddusw (%rax), %xmm1, %xmm2
21132113
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpaddw %xmm0, %xmm1, %xmm2
21142114
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpaddw (%rax), %xmm1, %xmm2
2115-
# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - - - - - - - - - - vpalignr $1, %xmm0, %xmm1, %xmm2
2115+
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - - - - - - - - - - vpalignr $1, %xmm0, %xmm1, %xmm2
21162116
# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpalignr $1, (%rax), %xmm1, %xmm2
21172117
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpand %xmm0, %xmm1, %xmm2
21182118
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpand (%rax), %xmm1, %xmm2

llvm/test/tools/llvm-mca/X86/Znver4/resources-avx2.s

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -484,13 +484,13 @@ vpxor (%rax), %ymm1, %ymm2
484484
# CHECK-NEXT: 1 8 0.50 * vpabsd (%rax), %ymm2
485485
# CHECK-NEXT: 1 1 0.50 vpabsw %ymm0, %ymm2
486486
# CHECK-NEXT: 1 8 0.50 * vpabsw (%rax), %ymm2
487-
# CHECK-NEXT: 1 1 0.50 vpackssdw %ymm0, %ymm1, %ymm2
487+
# CHECK-NEXT: 1 2 0.50 vpackssdw %ymm0, %ymm1, %ymm2
488488
# CHECK-NEXT: 1 8 0.50 * vpackssdw (%rax), %ymm1, %ymm2
489-
# CHECK-NEXT: 1 1 0.50 vpacksswb %ymm0, %ymm1, %ymm2
489+
# CHECK-NEXT: 1 2 0.50 vpacksswb %ymm0, %ymm1, %ymm2
490490
# CHECK-NEXT: 1 8 0.50 * vpacksswb (%rax), %ymm1, %ymm2
491-
# CHECK-NEXT: 1 1 0.50 vpackusdw %ymm0, %ymm1, %ymm2
491+
# CHECK-NEXT: 1 2 0.50 vpackusdw %ymm0, %ymm1, %ymm2
492492
# CHECK-NEXT: 1 8 0.50 * vpackusdw (%rax), %ymm1, %ymm2
493-
# CHECK-NEXT: 1 1 0.50 vpackuswb %ymm0, %ymm1, %ymm2
493+
# CHECK-NEXT: 1 2 0.50 vpackuswb %ymm0, %ymm1, %ymm2
494494
# CHECK-NEXT: 1 8 0.50 * vpackuswb (%rax), %ymm1, %ymm2
495495
# CHECK-NEXT: 1 1 0.25 vpaddb %ymm0, %ymm1, %ymm2
496496
# CHECK-NEXT: 1 8 0.50 * vpaddb (%rax), %ymm1, %ymm2
@@ -508,7 +508,7 @@ vpxor (%rax), %ymm1, %ymm2
508508
# CHECK-NEXT: 1 8 0.50 * vpaddusw (%rax), %ymm1, %ymm2
509509
# CHECK-NEXT: 1 1 0.25 vpaddw %ymm0, %ymm1, %ymm2
510510
# CHECK-NEXT: 1 8 0.50 * vpaddw (%rax), %ymm1, %ymm2
511-
# CHECK-NEXT: 1 1 0.50 vpalignr $1, %ymm0, %ymm1, %ymm2
511+
# CHECK-NEXT: 1 2 0.50 vpalignr $1, %ymm0, %ymm1, %ymm2
512512
# CHECK-NEXT: 1 8 0.50 * vpalignr $1, (%rax), %ymm1, %ymm2
513513
# CHECK-NEXT: 1 1 0.25 vpand %ymm0, %ymm1, %ymm2
514514
# CHECK-NEXT: 1 8 0.50 * vpand (%rax), %ymm1, %ymm2

0 commit comments

Comments
 (0)