Skip to content

Commit

Permalink
[MCA] Add tests for IPC on Cortex-A55
Browse files Browse the repository at this point in the history
The tests compare IPC statistics that MCA provides with IPC values
measured on Cortex-A55 hardware. For hardware tests, each snippet is
run in a loop unrolled by 1000, and IPC is measured by linux-perf.

Several tests do not match the hardware: the skewed ALU is not
supported, and LDR seems to be missing a forwarding path.

Differential Revision: https://reviews.llvm.org/D98174
  • Loading branch information
asavonic committed Apr 8, 2021
1 parent 3f6753e commit f08a2fc
Show file tree
Hide file tree
Showing 12 changed files with 216 additions and 0 deletions.
14 changes: 14 additions & 0 deletions llvm/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-0-single-add.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s

add w8, w8, #1  // w8 is both source and destination, so every iteration reads the previous iteration's result

# CHECK: Iterations: 1000
# CHECK-NEXT: Instructions: 1000
# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 1000

# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 1.00
# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 0.5
15 changes: 15 additions & 0 deletions llvm/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-1-add-seq.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s

add w8, w8, #1  // touches w8 only
add w9, w9, #1  // touches w9 only; shares no register with the ADD above

# CHECK: Iterations: 1000
# CHECK-NEXT: Instructions: 2000
# CHECK-NEXT: Total Cycles: 1003
# CHECK-NEXT: Total uOps: 2000

# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 1.99
# CHECK-NEXT: IPC: 1.99
# CHECK-NEXT: Block RThroughput: 1.0
15 changes: 15 additions & 0 deletions llvm/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-10-fma.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s

fmadd s3, s5, s6, s7  // destination s3 is not one of its own sources
fmadd s8, s9, s10, s11  // uses registers disjoint from the FMADD above

# CHECK: Iterations: 1000
# CHECK-NEXT: Instructions: 2000
# CHECK-NEXT: Total Cycles: 1004
# CHECK-NEXT: Total uOps: 2000

# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 1.99
# CHECK-NEXT: IPC: 1.99
# CHECK-NEXT: Block RThroughput: 1.0
19 changes: 19 additions & 0 deletions llvm/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-11-fma-mix.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s

# FMADD writes and retires out-of-order
fmadd s3, s5, s6, s7  // FP registers only; none of the ADDs below read or write them
# ADD instructions are issued and retire in-order
add w8, w8, #1  // touches w8 only
add w9, w9, #1  // touches w9 only
add w10, w10, #1  // touches w10 only

# CHECK: Iterations: 1000
# CHECK-NEXT: Instructions: 4000
# CHECK-NEXT: Total Cycles: 2003
# CHECK-NEXT: Total uOps: 4000

# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 2.00
# CHECK-NEXT: IPC: 2.00
# CHECK-NEXT: Block RThroughput: 2.0
18 changes: 18 additions & 0 deletions llvm/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-2-skewed-alu.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
# CHECK: IPC:
# CHECK-SAME: 2.00
#
# XFAIL: *
#
# Cortex-A55 has a secondary skewed ALU in the Ex1 stage for simple
# ALU instructions that do not require shifting or saturation
# resources. Results from the skewed ALU are available 1 cycle earlier.
#
# This feature allows the first and the second instruction to be
# dual-issued despite a register dependency (w8).
#
# MCA and the LLVM scheduling model do not support this yet.

add w8, w8, #1  // producer: writes w8
add w10, w8, #1  // consumer: reads the w8 written by the first ADD
add w12, w8, #1  // consumer: also reads w8
16 changes: 16 additions & 0 deletions llvm/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-3-mul.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s

add w8, w8, #1  // writes w8
add w12, w8, #1  // reads the w8 written by the ADD above
mul w10, w10, w10  // touches w10 only; shares no register with the ADDs

# CHECK: Iterations: 1000
# CHECK-NEXT: Instructions: 3000
# CHECK-NEXT: Total Cycles: 3003
# CHECK-NEXT: Total uOps: 3000

# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 1.00
# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 1.5
21 changes: 21 additions & 0 deletions llvm/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-4-sdiv.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s

# DIV is not modeled precisely: on hardware it takes variable
# number of cycles depending on its operands, but LLVM scheduling
# model only provides an average latency.

add w8, w8, #1  // touches w8 only; not used by the instructions below
movz w10, #1, lsl #16  // w10 = 0x00010000 (divisor)
movz w12, #32768, lsl #16  // w12 = 0x80000000 (dividend)
sdiv w10, w12, w10  // w10 = w12 / w10 (signed); consumes both MOVZ results

# CHECK: Iterations: 1000
# CHECK-NEXT: Instructions: 4000
# CHECK-NEXT: Total Cycles: 8004
# CHECK-NEXT: Total uOps: 4000

# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 0.50
# CHECK-NEXT: IPC: 0.50
# CHECK-NEXT: Block RThroughput: 8.0
22 changes: 22 additions & 0 deletions llvm/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-5-mul-sdiv.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s

# DIV is not modeled precisely: on hardware it takes variable
# number of cycles depending on its operands. LLVM scheduling model
# only provides an average latency.

add w8, w8, #1  // writes w8
movz w10, #1, lsl #16  // w10 = 0x00010000 (divisor)
movz w12, #32768, lsl #16  // w12 = 0x80000000 (dividend)
mul w11, w8, w8  // reads the w8 written by the ADD; result w11 is not used by SDIV
sdiv w10, w12, w10  // w10 = w12 / w10 (signed); consumes both MOVZ results

# CHECK: Iterations: 1000
# CHECK-NEXT: Instructions: 5000
# CHECK-NEXT: Total Cycles: 8004
# CHECK-NEXT: Total uOps: 5000

# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 0.62
# CHECK-NEXT: IPC: 0.62
# CHECK-NEXT: Block RThroughput: 8.0
25 changes: 25 additions & 0 deletions llvm/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-6-mul.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s

# It appears that ADD and MUL fuse together, if both can be issued in
# one cycle:
#
# add w12, w8, #1
# mul w10, w12, w10
#
# FIXME: MCA (and the LLVM scheduling model) do not support this. The test
# case uses different registers to break the pattern.

add w8, w8, #1  // writes w8
add w13, w8, #1  // reads w8; writes w13 rather than w12, so the MUL below does not consume it
mul w10, w12, w10  // reads w12, which no instruction in this snippet writes

# CHECK: Iterations: 1000
# CHECK-NEXT: Instructions: 3000
# CHECK-NEXT: Total Cycles: 3003
# CHECK-NEXT: Total uOps: 3000

# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 1.00
# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 1.5
17 changes: 17 additions & 0 deletions llvm/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-7-cmp.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s

add w8, w8, #1  // touches w8 only
add w12, w9, #1  // reads w9, writes w12
cmp w9, #42  // reads w9 and sets the condition flags; writes no general-purpose register
mul w10, w12, w10  // reads the w12 written by the second ADD

# CHECK: Iterations: 1000
# CHECK-NEXT: Instructions: 4000
# CHECK-NEXT: Total Cycles: 3004
# CHECK-NEXT: Total uOps: 4000

# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 1.33
# CHECK-NEXT: IPC: 1.33
# CHECK-NEXT: Block RThroughput: 2.0
19 changes: 19 additions & 0 deletions llvm/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-8-ldr.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s
# CHECK: IPC:
# CHECK-SAME: 1.50
#
# XFAIL: *
#
# MCA reports IPC = 0.60, while hardware shows IPC = 1.50.
#
# 1) The skewed ALU on Cortex-A55 is not modeled: ADD and AND
# instructions should be issued in the same cycle.
# See the A55-2-skewed-alu.s test for more details.
#
# 2) The Cortex-A55 manual mentions that there is a forwarding path from
# the ALU pipeline to the LD/ST pipeline. This is not implemented in
# the LLVM scheduling model.

add w8, w8, #1  // writes w8
and w12, w8, #0x3f  // reads the w8 written by the ADD; writes w12
ldr w14, [x10, w12, uxtw #2]  // address index is the w12 produced by the AND
15 changes: 15 additions & 0 deletions llvm/test/tools/llvm-mca/AArch64/Cortex/IPC/A55-9-fabs.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=aarch64 -mcpu=cortex-a55 --all-views=false --summary-view --iterations=1000 < %s | FileCheck %s

fabs s0, s1  // reads s1, writes s0
fabs s2, s3  // reads s3, writes s2; shares no register with the FABS above

# CHECK: Iterations: 1000
# CHECK-NEXT: Instructions: 2000
# CHECK-NEXT: Total Cycles: 1004
# CHECK-NEXT: Total uOps: 2000

# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 1.99
# CHECK-NEXT: IPC: 1.99
# CHECK-NEXT: Block RThroughput: 1.0

0 comments on commit f08a2fc

Please sign in to comment.