Skip to content

Commit

Permalink
[X86] Update register RCL/RCR by 1 and immediate scheduling for Intel CPUs
Browse files Browse the repository at this point in the history

Most Intel CPU scheduler files lumped the rotate-by-immediate and rotate-by-1
instructions together, but uops.info shows they are quite different.

For the most part, the by-1 instructions already matched the uops.info data,
except that the modeled latency was 3 instead of the 2 that uops.info reports.

The by immediate instructions need 7 or 8 uops and have higher latency.

It looks like the 8-bit by-immediate instructions may need even more
uops, but I just lumped them in with the 16/32/64-bit forms.

Noticed while looking into PR53648, so I mostly cared about the by-1
instructions.

Reviewed By: RKSimon, pengfei

Differential Revision: https://reviews.llvm.org/D119217
  • Loading branch information
topperc committed Feb 8, 2022
1 parent c151225 commit 56d6ccd
Show file tree
Hide file tree
Showing 14 changed files with 329 additions and 245 deletions.
20 changes: 17 additions & 3 deletions llvm/lib/Target/X86/X86SchedBroadwell.td
Expand Up @@ -814,12 +814,26 @@ def BWWriteResGroup34 : SchedWriteRes<[BWPort6,BWPort0156]> {
def: InstRW<[BWWriteResGroup34], (instregex "CLD")>;

def BWWriteResGroup35 : SchedWriteRes<[BWPort06,BWPort0156]> {
let Latency = 3;
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
def: InstRW<[BWWriteResGroup35], (instregex "RCL(8|16|32|64)r(1|i)",
"RCR(8|16|32|64)r(1|i)")>;
def: InstRW<[BWWriteResGroup35], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;

def BWWriteResGroup36 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
let Latency = 5;
let NumMicroOps = 8;
let ResourceCycles = [2,4,2];
}
def: InstRW<[BWWriteResGroup36], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;

def BWWriteResGroup36b : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
let Latency = 6;
let NumMicroOps = 8;
let ResourceCycles = [2,4,2];
}
def: InstRW<[BWWriteResGroup36b], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;

def BWWriteResGroup37 : SchedWriteRes<[BWPort4,BWPort6,BWPort237,BWPort0156]> {
let Latency = 3;
Expand Down
20 changes: 17 additions & 3 deletions llvm/lib/Target/X86/X86SchedHaswell.td
Expand Up @@ -1299,12 +1299,26 @@ def HWWriteResGroup58 : SchedWriteRes<[HWPort6,HWPort0156]> {
def: InstRW<[HWWriteResGroup58], (instregex "CLD")>;

def HWWriteResGroup59 : SchedWriteRes<[HWPort06,HWPort0156]> {
let Latency = 3;
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
def: InstRW<[HWWriteResGroup59], (instregex "RCL(8|16|32|64)r(1|i)",
"RCR(8|16|32|64)r(1|i)")>;
def: InstRW<[HWWriteResGroup59], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;

def HWWriteResGroup60 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
let Latency = 5;
let NumMicroOps = 8;
let ResourceCycles = [2,4,2];
}
def: InstRW<[HWWriteResGroup60], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;

def HWWriteResGroup60b : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
let Latency = 6;
let NumMicroOps = 8;
let ResourceCycles = [2,4,2];
}
def: InstRW<[HWWriteResGroup60b], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;

def HWWriteResGroup61 : SchedWriteRes<[HWPort0,HWPort4,HWPort237]> {
let Latency = 4;
Expand Down
20 changes: 17 additions & 3 deletions llvm/lib/Target/X86/X86SchedIceLake.td
Expand Up @@ -923,12 +923,26 @@ def ICXWriteResGroup43 : SchedWriteRes<[ICXPort237,ICXPort0156]> {
def: InstRW<[ICXWriteResGroup43], (instrs MFENCE)>;

def ICXWriteResGroup44 : SchedWriteRes<[ICXPort06,ICXPort0156]> {
let Latency = 3;
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
def: InstRW<[ICXWriteResGroup44], (instregex "RCL(8|16|32|64)r(1|i)",
"RCR(8|16|32|64)r(1|i)")>;
def: InstRW<[ICXWriteResGroup44], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;

def ICXWriteResGroup44b : SchedWriteRes<[ICXPort1,ICXPort06,ICXPort0156]> {
let Latency = 5;
let NumMicroOps = 7;
let ResourceCycles = [2,3,2];
}
def: InstRW<[ICXWriteResGroup44b], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;

def ICXWriteResGroup44c : SchedWriteRes<[ICXPort1,ICXPort06,ICXPort0156]> {
let Latency = 6;
let NumMicroOps = 7;
let ResourceCycles = [2,3,2];
}
def: InstRW<[ICXWriteResGroup44c], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;

def ICXWriteResGroup45 : SchedWriteRes<[ICXPort0,ICXPort4,ICXPort237]> {
let Latency = 3;
Expand Down
26 changes: 20 additions & 6 deletions llvm/lib/Target/X86/X86SchedSandyBridge.td
Expand Up @@ -678,13 +678,27 @@ def SBWriteResGroup22 : SchedWriteRes<[SBPort0,SBPort5]> {
}
def: InstRW<[SBWriteResGroup22], (instregex "(V?)EXTRACTPSrr")>;

def SBWriteResGroup23 : SchedWriteRes<[SBPort05]> {
def SBWriteResGroup23 : SchedWriteRes<[SBPort05,SBPort015]> {
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [3];
let ResourceCycles = [2,1];
}
def: InstRW<[SBWriteResGroup23], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;

def SBWriteResGroup24 : SchedWriteRes<[SBPort1,SBPort5,SBPort05,SBPort015]> {
let Latency = 3;
let NumMicroOps = 8;
let ResourceCycles = [1,1,4,2];
}
def: InstRW<[SBWriteResGroup24], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;

def SBWriteResGroup24b : SchedWriteRes<[SBPort1,SBPort5,SBPort05,SBPort015]> {
let Latency = 4;
let NumMicroOps = 8;
let ResourceCycles = [1,1,4,2];
}
def: InstRW<[SBWriteResGroup23], (instregex "RCL(8|16|32|64)r1",
"RCR(8|16|32|64)r1")>;
def: InstRW<[SBWriteResGroup24b], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;

def SBWriteResGroup25_1 : SchedWriteRes<[SBPort23,SBPort015]> {
let Latency = 7;
Expand Down Expand Up @@ -727,8 +741,8 @@ def SBWriteResGroup76 : SchedWriteRes<[SBPort05]> {
let NumMicroOps = 8;
let ResourceCycles = [8];
}
def: InstRW<[SBWriteResGroup76], (instregex "RCL(8|16|32|64)r(i|CL)",
"RCR(8|16|32|64)r(i|CL)")>;
def: InstRW<[SBWriteResGroup76], (instregex "RCL(8|16|32|64)rCL",
"RCR(8|16|32|64)rCL")>;

def SBWriteResGroup33 : SchedWriteRes<[SBPort4,SBPort23]> {
let Latency = 5;
Expand Down
20 changes: 17 additions & 3 deletions llvm/lib/Target/X86/X86SchedSkylakeClient.td
Expand Up @@ -836,12 +836,26 @@ def SKLWriteResGroup41 : SchedWriteRes<[SKLPort237,SKLPort0156]> {
def: InstRW<[SKLWriteResGroup41], (instrs MFENCE)>;

def SKLWriteResGroup42 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
let Latency = 3;
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
def: InstRW<[SKLWriteResGroup42], (instregex "RCL(8|16|32|64)r(1|i)",
"RCR(8|16|32|64)r(1|i)")>;
def: InstRW<[SKLWriteResGroup42], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;

def SKLWriteResGroup42b : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
let Latency = 5;
let NumMicroOps = 8;
let ResourceCycles = [2,4,2];
}
def: InstRW<[SKLWriteResGroup42b], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;

def SKLWriteResGroup42c : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
let Latency = 6;
let NumMicroOps = 8;
let ResourceCycles = [2,4,2];
}
def: InstRW<[SKLWriteResGroup42c], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;

def SKLWriteResGroup43 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> {
let Latency = 3;
Expand Down
20 changes: 17 additions & 3 deletions llvm/lib/Target/X86/X86SchedSkylakeServer.td
Expand Up @@ -905,12 +905,26 @@ def SKXWriteResGroup43 : SchedWriteRes<[SKXPort237,SKXPort0156]> {
def: InstRW<[SKXWriteResGroup43], (instrs MFENCE)>;

def SKXWriteResGroup44 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
let Latency = 3;
let Latency = 2;
let NumMicroOps = 3;
let ResourceCycles = [1,2];
}
def: InstRW<[SKXWriteResGroup44], (instregex "RCL(8|16|32|64)r(1|i)",
"RCR(8|16|32|64)r(1|i)")>;
def: InstRW<[SKXWriteResGroup44], (instrs RCL8r1, RCL16r1, RCL32r1, RCL64r1,
RCR8r1, RCR16r1, RCR32r1, RCR64r1)>;

def SKXWriteResGroup44b : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
let Latency = 5;
let NumMicroOps = 8;
let ResourceCycles = [2,4,2];
}
def: InstRW<[SKXWriteResGroup44b], (instrs RCR8ri, RCR16ri, RCR32ri, RCR64ri)>;

def SKXWriteResGroup44c : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
let Latency = 6;
let NumMicroOps = 8;
let ResourceCycles = [2,4,2];
}
def: InstRW<[SKXWriteResGroup44c], (instrs RCL8ri, RCL16ri, RCL32ri, RCL64ri)>;

def SKXWriteResGroup45 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237]> {
let Latency = 3;
Expand Down
66 changes: 33 additions & 33 deletions llvm/test/tools/llvm-mca/X86/Barcelona/resources-x86_64.s
Expand Up @@ -1506,48 +1506,48 @@ xorq (%rax), %rdi
# CHECK-NEXT: 1 100 0.33 U outsw (%rsi), %dx
# CHECK-NEXT: 1 100 0.33 U outsl (%rsi), %dx
# CHECK-NEXT: 4 4 1.33 * * U pause
# CHECK-NEXT: 3 2 1.50 rclb %dil
# CHECK-NEXT: 3 2 1.50 rcrb %dil
# CHECK-NEXT: 3 2 1.00 rclb %dil
# CHECK-NEXT: 3 2 1.00 rcrb %dil
# CHECK-NEXT: 11 11 3.50 * rclb (%rax)
# CHECK-NEXT: 11 11 3.50 * rcrb (%rax)
# CHECK-NEXT: 8 5 4.00 rclb $7, %dil
# CHECK-NEXT: 8 5 4.00 rcrb $7, %dil
# CHECK-NEXT: 8 4 2.67 rclb $7, %dil
# CHECK-NEXT: 8 3 2.67 rcrb $7, %dil
# CHECK-NEXT: 11 11 3.50 * rclb $7, (%rax)
# CHECK-NEXT: 11 11 3.50 * rcrb $7, (%rax)
# CHECK-NEXT: 8 5 4.00 rclb %cl, %dil
# CHECK-NEXT: 8 5 4.00 rcrb %cl, %dil
# CHECK-NEXT: 11 11 3.50 * rclb %cl, (%rax)
# CHECK-NEXT: 11 11 3.50 * rcrb %cl, (%rax)
# CHECK-NEXT: 3 2 1.50 rclw %di
# CHECK-NEXT: 3 2 1.50 rcrw %di
# CHECK-NEXT: 3 2 1.00 rclw %di
# CHECK-NEXT: 3 2 1.00 rcrw %di
# CHECK-NEXT: 11 11 3.50 * rclw (%rax)
# CHECK-NEXT: 11 11 3.50 * rcrw (%rax)
# CHECK-NEXT: 8 5 4.00 rclw $7, %di
# CHECK-NEXT: 8 5 4.00 rcrw $7, %di
# CHECK-NEXT: 8 4 2.67 rclw $7, %di
# CHECK-NEXT: 8 3 2.67 rcrw $7, %di
# CHECK-NEXT: 11 11 3.50 * rclw $7, (%rax)
# CHECK-NEXT: 11 11 3.50 * rcrw $7, (%rax)
# CHECK-NEXT: 8 5 4.00 rclw %cl, %di
# CHECK-NEXT: 8 5 4.00 rcrw %cl, %di
# CHECK-NEXT: 11 11 3.50 * rclw %cl, (%rax)
# CHECK-NEXT: 11 11 3.50 * rcrw %cl, (%rax)
# CHECK-NEXT: 3 2 1.50 rcll %edi
# CHECK-NEXT: 3 2 1.50 rcrl %edi
# CHECK-NEXT: 3 2 1.00 rcll %edi
# CHECK-NEXT: 3 2 1.00 rcrl %edi
# CHECK-NEXT: 11 11 3.50 * rcll (%rax)
# CHECK-NEXT: 11 11 3.50 * rcrl (%rax)
# CHECK-NEXT: 8 5 4.00 rcll $7, %edi
# CHECK-NEXT: 8 5 4.00 rcrl $7, %edi
# CHECK-NEXT: 8 4 2.67 rcll $7, %edi
# CHECK-NEXT: 8 3 2.67 rcrl $7, %edi
# CHECK-NEXT: 11 11 3.50 * rcll $7, (%rax)
# CHECK-NEXT: 11 11 3.50 * rcrl $7, (%rax)
# CHECK-NEXT: 8 5 4.00 rcll %cl, %edi
# CHECK-NEXT: 8 5 4.00 rcrl %cl, %edi
# CHECK-NEXT: 11 11 3.50 * rcll %cl, (%rax)
# CHECK-NEXT: 11 11 3.50 * rcrl %cl, (%rax)
# CHECK-NEXT: 3 2 1.50 rclq %rdi
# CHECK-NEXT: 3 2 1.50 rcrq %rdi
# CHECK-NEXT: 3 2 1.00 rclq %rdi
# CHECK-NEXT: 3 2 1.00 rcrq %rdi
# CHECK-NEXT: 11 11 3.50 * rclq (%rax)
# CHECK-NEXT: 11 11 3.50 * rcrq (%rax)
# CHECK-NEXT: 8 5 4.00 rclq $7, %rdi
# CHECK-NEXT: 8 5 4.00 rcrq $7, %rdi
# CHECK-NEXT: 8 4 2.67 rclq $7, %rdi
# CHECK-NEXT: 8 3 2.67 rcrq $7, %rdi
# CHECK-NEXT: 11 11 3.50 * rclq $7, (%rax)
# CHECK-NEXT: 11 11 3.50 * rcrq $7, (%rax)
# CHECK-NEXT: 8 5 4.00 rclq %cl, %rdi
Expand Down Expand Up @@ -1953,7 +1953,7 @@ xorq (%rax), %rdi

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
# CHECK-NEXT: 160.00 - 670.17 294.67 361.00 687.17 455.50 455.50
# CHECK-NEXT: 160.00 - 658.17 310.67 361.00 683.17 455.50 455.50

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
Expand Down Expand Up @@ -2433,48 +2433,48 @@ xorq (%rax), %rdi
# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - outsw (%rsi), %dx
# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - outsl (%rsi), %dx
# CHECK-NEXT: - - 1.00 1.00 - 2.00 - - pause
# CHECK-NEXT: - - 1.50 - - 1.50 - - rclb %dil
# CHECK-NEXT: - - 1.50 - - 1.50 - - rcrb %dil
# CHECK-NEXT: - - 1.33 0.33 - 1.33 - - rclb %dil
# CHECK-NEXT: - - 1.33 0.33 - 1.33 - - rcrb %dil
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rclb (%rax)
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rcrb (%rax)
# CHECK-NEXT: - - 4.00 - - 4.00 - - rclb $7, %dil
# CHECK-NEXT: - - 4.00 - - 4.00 - - rcrb $7, %dil
# CHECK-NEXT: - - 2.67 1.67 - 3.67 - - rclb $7, %dil
# CHECK-NEXT: - - 2.67 1.67 - 3.67 - - rcrb $7, %dil
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rclb $7, (%rax)
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rcrb $7, (%rax)
# CHECK-NEXT: - - 4.00 - - 4.00 - - rclb %cl, %dil
# CHECK-NEXT: - - 4.00 - - 4.00 - - rcrb %cl, %dil
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rclb %cl, (%rax)
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rcrb %cl, (%rax)
# CHECK-NEXT: - - 1.50 - - 1.50 - - rclw %di
# CHECK-NEXT: - - 1.50 - - 1.50 - - rcrw %di
# CHECK-NEXT: - - 1.33 0.33 - 1.33 - - rclw %di
# CHECK-NEXT: - - 1.33 0.33 - 1.33 - - rcrw %di
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rclw (%rax)
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rcrw (%rax)
# CHECK-NEXT: - - 4.00 - - 4.00 - - rclw $7, %di
# CHECK-NEXT: - - 4.00 - - 4.00 - - rcrw $7, %di
# CHECK-NEXT: - - 2.67 1.67 - 3.67 - - rclw $7, %di
# CHECK-NEXT: - - 2.67 1.67 - 3.67 - - rcrw $7, %di
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rclw $7, (%rax)
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rcrw $7, (%rax)
# CHECK-NEXT: - - 4.00 - - 4.00 - - rclw %cl, %di
# CHECK-NEXT: - - 4.00 - - 4.00 - - rcrw %cl, %di
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rclw %cl, (%rax)
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rcrw %cl, (%rax)
# CHECK-NEXT: - - 1.50 - - 1.50 - - rcll %edi
# CHECK-NEXT: - - 1.50 - - 1.50 - - rcrl %edi
# CHECK-NEXT: - - 1.33 0.33 - 1.33 - - rcll %edi
# CHECK-NEXT: - - 1.33 0.33 - 1.33 - - rcrl %edi
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rcll (%rax)
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rcrl (%rax)
# CHECK-NEXT: - - 4.00 - - 4.00 - - rcll $7, %edi
# CHECK-NEXT: - - 4.00 - - 4.00 - - rcrl $7, %edi
# CHECK-NEXT: - - 2.67 1.67 - 3.67 - - rcll $7, %edi
# CHECK-NEXT: - - 2.67 1.67 - 3.67 - - rcrl $7, %edi
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rcll $7, (%rax)
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rcrl $7, (%rax)
# CHECK-NEXT: - - 4.00 - - 4.00 - - rcll %cl, %edi
# CHECK-NEXT: - - 4.00 - - 4.00 - - rcrl %cl, %edi
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rcll %cl, (%rax)
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rcrl %cl, (%rax)
# CHECK-NEXT: - - 1.50 - - 1.50 - - rclq %rdi
# CHECK-NEXT: - - 1.50 - - 1.50 - - rcrq %rdi
# CHECK-NEXT: - - 1.33 0.33 - 1.33 - - rclq %rdi
# CHECK-NEXT: - - 1.33 0.33 - 1.33 - - rcrq %rdi
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rclq (%rax)
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rcrq (%rax)
# CHECK-NEXT: - - 4.00 - - 4.00 - - rclq $7, %rdi
# CHECK-NEXT: - - 4.00 - - 4.00 - - rcrq $7, %rdi
# CHECK-NEXT: - - 2.67 1.67 - 3.67 - - rclq $7, %rdi
# CHECK-NEXT: - - 2.67 1.67 - 3.67 - - rcrq $7, %rdi
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rclq $7, (%rax)
# CHECK-NEXT: - - 3.50 - - 3.50 2.00 2.00 rcrq $7, (%rax)
# CHECK-NEXT: - - 4.00 - - 4.00 - - rclq %cl, %rdi
Expand Down

0 comments on commit 56d6ccd

Please sign in to comment.