Skip to content

Commit

Permalink
[X86] AMD Znver2 (Rome) Scheduler enablement
Browse files Browse the repository at this point in the history
This patch provides the details of the znver2 scheduler model.
There are a few improvements in execution units, latencies and
throughput compared with znver1.
The llvm-mca tests that were present for znver1 were replicated for znver2.
The latencies, execution units, timeline and throughput information are updated for znver2.

Reviewers: craig.topper, Simon Pilgrim

Differential Revision: https://reviews.llvm.org/D66088
  • Loading branch information
ganeshgit committed Jan 9, 2020
1 parent 1a1dbea commit 3408940
Show file tree
Hide file tree
Showing 59 changed files with 14,151 additions and 8 deletions.
3 changes: 2 additions & 1 deletion llvm/lib/Target/X86/X86.td
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,7 @@ include "X86SchedHaswell.td"
include "X86SchedBroadwell.td"
include "X86ScheduleSLM.td"
include "X86ScheduleZnver1.td"
include "X86ScheduleZnver2.td"
include "X86ScheduleBdVer2.td"
include "X86ScheduleBtVer2.td"
include "X86SchedSkylakeClient.td"
Expand Down Expand Up @@ -1204,7 +1205,7 @@ def : Proc<"bdver3", ProcessorFeatures.BdVer3Features>;
def : Proc<"bdver4", ProcessorFeatures.BdVer4Features>;

def : ProcessorModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures>;
def : ProcessorModel<"znver2", Znver1Model, ProcessorFeatures.ZN2Features>;
def : ProcessorModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features>;

def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
Feature3DNowA, FeatureInsertVZEROUPPER]>;
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/X86/X86InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2851,7 +2851,7 @@ let SchedRW = [WriteStore], Defs = [EFLAGS] in {
//===----------------------------------------------------------------------===//
// CLZERO Instruction
//
let SchedRW = [WriteSystem] in {
let SchedRW = [WriteLoad] in {
let Uses = [EAX] in
def CLZERO32r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
TB, Requires<[HasCLZERO, Not64BitMode]>;
Expand Down
1,548 changes: 1,548 additions & 0 deletions llvm/lib/Target/X86/X86ScheduleZnver2.td

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions llvm/test/MC/X86/x86_long_nop.s
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=btver2 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=znver1 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=znver1 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=znver2 %s | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15
# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=znver2 | llvm-objdump -d -no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15

# Ensure alignment directives also emit sequences of 10, 11 and 15-byte NOPs on processors
# capable of using long NOPs.
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/tools/llvm-mca/X86/Generic/resources-clzero.s
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ clzero
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 100 0.33 U clzero
# CHECK-NEXT: 1 5 0.50 U clzero

# CHECK: Resources:
# CHECK-NEXT: [0] - SBDivider
Expand All @@ -26,8 +26,8 @@ clzero

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
# CHECK-NEXT: - - 0.33 0.33 - 0.33 - -
# CHECK-NEXT: - - - - - - 0.50 0.50

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
# CHECK-NEXT: - - 0.33 0.33 - 0.33 - - clzero
# CHECK-NEXT: - - - - - - 0.50 0.50 clzero
6 changes: 3 additions & 3 deletions llvm/test/tools/llvm-mca/X86/Znver1/resources-clzero.s
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ clzero
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 100 0.25 U clzero
# CHECK-NEXT: 1 8 0.50 U clzero

# CHECK: Resources:
# CHECK-NEXT: [0] - ZnAGU0
Expand All @@ -30,8 +30,8 @@ clzero

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11]
# CHECK-NEXT: - - - - - - - - - - - -
# CHECK-NEXT: 0.50 0.50 - - - - - - - - - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions:
# CHECK-NEXT: - - - - - - - - - - - - clzero
# CHECK-NEXT: 0.50 0.50 - - - - - - - - - - clzero
47 changes: 47 additions & 0 deletions llvm/test/tools/llvm-mca/X86/Znver2/partial-reg-update-2.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver2 -iterations=1 -resource-pressure=false -timeline < %s | FileCheck %s

imul %rax, %rbx
lzcnt %ax, %bx
add %ecx, %ebx

# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 3
# CHECK-NEXT: Total Cycles: 9
# CHECK-NEXT: Total uOps: 4

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 0.44
# CHECK-NEXT: IPC: 0.33
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 2 4 1.00 imulq %rax, %rbx
# CHECK-NEXT: 1 1 0.25 lzcntw %ax, %bx
# CHECK-NEXT: 1 1 0.25 addl %ecx, %ebx

# CHECK: Timeline view:
# CHECK-NEXT: Index 012345678

# CHECK: [0,0] DeeeeER . imulq %rax, %rbx
# CHECK-NEXT: [0,1] D====eER. lzcntw %ax, %bx
# CHECK-NEXT: [0,2] D=====eER addl %ecx, %ebx

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 imulq %rax, %rbx
# CHECK-NEXT: 1. 1 5.0 0.0 0.0 lzcntw %ax, %bx
# CHECK-NEXT: 2. 1 6.0 0.0 0.0 addl %ecx, %ebx
91 changes: 91 additions & 0 deletions llvm/test/tools/llvm-mca/X86/Znver2/partial-reg-update-3.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver2 -iterations=1500 -timeline -timeline-max-iterations=6 < %s | FileCheck %s

# The ILP is limited by the false dependency on %dx. So, the mov cannot execute
# in parallel with the add.

add %cx, %dx
mov %ax, %dx
xor %bx, %dx

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 4500
# CHECK-NEXT: Total Cycles: 4503
# CHECK-NEXT: Total uOps: 4500

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 1.00
# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 0.8

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 1 0.25 addw %cx, %dx
# CHECK-NEXT: 1 1 0.25 movw %ax, %dx
# CHECK-NEXT: 1 1 0.25 xorw %bx, %dx

# CHECK: Resources:
# CHECK-NEXT: [0] - Zn2AGU0
# CHECK-NEXT: [1] - Zn2AGU1
# CHECK-NEXT: [2] - Zn2AGU2
# CHECK-NEXT: [3] - Zn2ALU0
# CHECK-NEXT: [4] - Zn2ALU1
# CHECK-NEXT: [5] - Zn2ALU2
# CHECK-NEXT: [6] - Zn2ALU3
# CHECK-NEXT: [7] - Zn2Divider
# CHECK-NEXT: [8] - Zn2FPU0
# CHECK-NEXT: [9] - Zn2FPU1
# CHECK-NEXT: [10] - Zn2FPU2
# CHECK-NEXT: [11] - Zn2FPU3
# CHECK-NEXT: [12] - Zn2Multiplier

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12]
# CHECK-NEXT: - - - 0.75 0.75 0.75 0.75 - - - - - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions:
# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - addw %cx, %dx
# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - movw %ax, %dx
# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - xorw %bx, %dx

# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 0

# CHECK: [0,0] DeER . . . . addw %cx, %dx
# CHECK-NEXT: [0,1] D=eER. . . . movw %ax, %dx
# CHECK-NEXT: [0,2] D==eER . . . xorw %bx, %dx
# CHECK-NEXT: [1,0] D===eER . . . addw %cx, %dx
# CHECK-NEXT: [1,1] .D===eER . . . movw %ax, %dx
# CHECK-NEXT: [1,2] .D====eER . . . xorw %bx, %dx
# CHECK-NEXT: [2,0] .D=====eER. . . addw %cx, %dx
# CHECK-NEXT: [2,1] .D======eER . . movw %ax, %dx
# CHECK-NEXT: [2,2] . D======eER . . xorw %bx, %dx
# CHECK-NEXT: [3,0] . D=======eER . . addw %cx, %dx
# CHECK-NEXT: [3,1] . D========eER . . movw %ax, %dx
# CHECK-NEXT: [3,2] . D=========eER. . xorw %bx, %dx
# CHECK-NEXT: [4,0] . D=========eER . addw %cx, %dx
# CHECK-NEXT: [4,1] . D==========eER . movw %ax, %dx
# CHECK-NEXT: [4,2] . D===========eER . xorw %bx, %dx
# CHECK-NEXT: [5,0] . D============eER . addw %cx, %dx
# CHECK-NEXT: [5,1] . D============eER. movw %ax, %dx
# CHECK-NEXT: [5,2] . D=============eER xorw %bx, %dx

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 6 7.0 0.2 0.0 addw %cx, %dx
# CHECK-NEXT: 1. 6 7.7 0.0 0.0 movw %ax, %dx
# CHECK-NEXT: 2. 6 8.5 0.0 0.0 xorw %bx, %dx
94 changes: 94 additions & 0 deletions llvm/test/tools/llvm-mca/X86/Znver2/partial-reg-update-4.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver2 -iterations=1500 -timeline -timeline-max-iterations=7 < %s | FileCheck %s

# The lzcnt cannot execute in parallel with the imul because there is a false
# dependency on %bx.

imul %ax, %bx
lzcnt %ax, %bx
add %cx, %bx

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 4500
# CHECK-NEXT: Total Cycles: 7503
# CHECK-NEXT: Total uOps: 4500

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 0.60
# CHECK-NEXT: IPC: 0.60
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 3 1.00 imulw %ax, %bx
# CHECK-NEXT: 1 1 0.25 lzcntw %ax, %bx
# CHECK-NEXT: 1 1 0.25 addw %cx, %bx

# CHECK: Resources:
# CHECK-NEXT: [0] - Zn2AGU0
# CHECK-NEXT: [1] - Zn2AGU1
# CHECK-NEXT: [2] - Zn2AGU2
# CHECK-NEXT: [3] - Zn2ALU0
# CHECK-NEXT: [4] - Zn2ALU1
# CHECK-NEXT: [5] - Zn2ALU2
# CHECK-NEXT: [6] - Zn2ALU3
# CHECK-NEXT: [7] - Zn2Divider
# CHECK-NEXT: [8] - Zn2FPU0
# CHECK-NEXT: [9] - Zn2FPU1
# CHECK-NEXT: [10] - Zn2FPU2
# CHECK-NEXT: [11] - Zn2FPU3
# CHECK-NEXT: [12] - Zn2Multiplier

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12]
# CHECK-NEXT: - - - 0.67 1.00 0.67 0.67 - - - - - 1.00

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions:
# CHECK-NEXT: - - - - 1.00 - - - - - - - 1.00 imulw %ax, %bx
# CHECK-NEXT: - - - 0.33 - 0.33 0.33 - - - - - - lzcntw %ax, %bx
# CHECK-NEXT: - - - 0.33 - 0.33 0.33 - - - - - - addw %cx, %bx

# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 01234567
# CHECK-NEXT: Index 0123456789 0123456789

# CHECK: [0,0] DeeeER . . . . . . . imulw %ax, %bx
# CHECK-NEXT: [0,1] D===eER . . . . . . . lzcntw %ax, %bx
# CHECK-NEXT: [0,2] D====eER . . . . . . . addw %cx, %bx
# CHECK-NEXT: [1,0] D=====eeeER . . . . . . imulw %ax, %bx
# CHECK-NEXT: [1,1] .D=======eER . . . . . . lzcntw %ax, %bx
# CHECK-NEXT: [1,2] .D========eER . . . . . . addw %cx, %bx
# CHECK-NEXT: [2,0] .D=========eeeER . . . . . imulw %ax, %bx
# CHECK-NEXT: [2,1] .D============eER . . . . . lzcntw %ax, %bx
# CHECK-NEXT: [2,2] . D============eER . . . . . addw %cx, %bx
# CHECK-NEXT: [3,0] . D=============eeeER . . . . imulw %ax, %bx
# CHECK-NEXT: [3,1] . D================eER . . . . lzcntw %ax, %bx
# CHECK-NEXT: [3,2] . D=================eER . . . . addw %cx, %bx
# CHECK-NEXT: [4,0] . D=================eeeER . . . imulw %ax, %bx
# CHECK-NEXT: [4,1] . D====================eER . . . lzcntw %ax, %bx
# CHECK-NEXT: [4,2] . D=====================eER . . . addw %cx, %bx
# CHECK-NEXT: [5,0] . D======================eeeER . . imulw %ax, %bx
# CHECK-NEXT: [5,1] . D========================eER . . lzcntw %ax, %bx
# CHECK-NEXT: [5,2] . D=========================eER . . addw %cx, %bx
# CHECK-NEXT: [6,0] . D==========================eeeER . imulw %ax, %bx
# CHECK-NEXT: [6,1] . D=============================eER. lzcntw %ax, %bx
# CHECK-NEXT: [6,2] . D=============================eER addw %cx, %bx

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 7 14.1 0.1 0.0 imulw %ax, %bx
# CHECK-NEXT: 1. 7 16.9 0.0 0.0 lzcntw %ax, %bx
# CHECK-NEXT: 2. 7 17.6 0.0 0.0 addw %cx, %bx
70 changes: 70 additions & 0 deletions llvm/test/tools/llvm-mca/X86/Znver2/partial-reg-update-5.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=znver2 -iterations=1500 -timeline -timeline-max-iterations=8 < %s | FileCheck %s

lzcnt %ax, %bx ## partial register stall.

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 1500
# CHECK-NEXT: Total Cycles: 1503
# CHECK-NEXT: Total uOps: 1500

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 1.00
# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 0.3

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 1 0.25 lzcntw %ax, %bx

# CHECK: Resources:
# CHECK-NEXT: [0] - Zn2AGU0
# CHECK-NEXT: [1] - Zn2AGU1
# CHECK-NEXT: [2] - Zn2AGU2
# CHECK-NEXT: [3] - Zn2ALU0
# CHECK-NEXT: [4] - Zn2ALU1
# CHECK-NEXT: [5] - Zn2ALU2
# CHECK-NEXT: [6] - Zn2ALU3
# CHECK-NEXT: [7] - Zn2Divider
# CHECK-NEXT: [8] - Zn2FPU0
# CHECK-NEXT: [9] - Zn2FPU1
# CHECK-NEXT: [10] - Zn2FPU2
# CHECK-NEXT: [11] - Zn2FPU3
# CHECK-NEXT: [12] - Zn2Multiplier

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12]
# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] Instructions:
# CHECK-NEXT: - - - 0.25 0.25 0.25 0.25 - - - - - - lzcntw %ax, %bx

# CHECK: Timeline view:
# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789

# CHECK: [0,0] DeER . . lzcntw %ax, %bx
# CHECK-NEXT: [1,0] D=eER. . lzcntw %ax, %bx
# CHECK-NEXT: [2,0] D==eER . lzcntw %ax, %bx
# CHECK-NEXT: [3,0] D===eER . lzcntw %ax, %bx
# CHECK-NEXT: [4,0] .D===eER . lzcntw %ax, %bx
# CHECK-NEXT: [5,0] .D====eER . lzcntw %ax, %bx
# CHECK-NEXT: [6,0] .D=====eER. lzcntw %ax, %bx
# CHECK-NEXT: [7,0] .D======eER lzcntw %ax, %bx

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 8 4.0 0.1 0.0 lzcntw %ax, %bx
Loading

0 comments on commit 3408940

Please sign in to comment.