1,671 changes: 1,671 additions & 0 deletions llvm/test/CodeGen/X86/schedule-x86_64.ll

Large diffs are not rendered by default.

38 changes: 27 additions & 11 deletions llvm/test/CodeGen/X86/small-byval-memcpy.ll
Original file line number Diff line number Diff line change
@@ -1,25 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
;
; Each lit invocation below feeds this file to llc for x86_64-apple-darwin under
; a different -mcpu value and verifies the emitted assembly with a dedicated
; FileCheck prefix, so per-CPU differences in the generated code are pinned
; independently of each other.
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core2 | FileCheck %s --check-prefix=CORE2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck %s --check-prefix=NEHALEM
; NOTE(review): the check prefix named after bdver2 on the next line is driven
; by -mcpu=x86-64 rather than -mcpu=bdver2 - confirm this is intentional.
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=x86-64 | FileCheck %s --check-prefix=BDVER2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2

; Declaration of the memcpy intrinsic that the test function below calls.
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1)

; copy16bytes: copies 16 bytes from %b to %a through the llvm.memcpy intrinsic.
; Neither pointer argument carries an alignment attribute, so the backend has
; to choose an instruction sequence that is legal for unaligned addresses.
;
; Expected lowering per configuration (see the assertions inside the body):
;   * core2             - two 8-byte loads followed by two 8-byte stores (movq)
;   * nehalem           - one unaligned 16-byte SSE load/store pair (movups)
;   * x86-64 run        - same movups pair as nehalem
;   * btver2            - VEX-encoded unaligned load/store pair (vmovups)
;
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand. There must
; be exactly one assertion sequence per prefix: FileCheck consumes the output
; in order, so a second sequence placed after the first one has already matched
; through retq can never match.
define void @copy16bytes(i8* nocapture %a, i8* nocapture readonly %b) {
; CORE2-LABEL: copy16bytes:
; CORE2: ## %bb.0:
; CORE2-NEXT: movq (%rsi), %rax
; CORE2-NEXT: movq 8(%rsi), %rcx
; CORE2-NEXT: movq %rcx, 8(%rdi)
; CORE2-NEXT: movq %rax, (%rdi)
; CORE2-NEXT: retq
;
; NEHALEM-LABEL: copy16bytes:
; NEHALEM: ## %bb.0:
; NEHALEM-NEXT: movups (%rsi), %xmm0
; NEHALEM-NEXT: movups %xmm0, (%rdi)
; NEHALEM-NEXT: retq
;
; BDVER2-LABEL: copy16bytes:
; BDVER2: ## %bb.0:
; BDVER2-NEXT: movups (%rsi), %xmm0
; BDVER2-NEXT: movups %xmm0, (%rdi)
; BDVER2-NEXT: retq
;
; BTVER2-LABEL: copy16bytes:
; BTVER2: ## %bb.0:
; BTVER2-NEXT: vmovups (%rsi), %xmm0
; BTVER2-NEXT: vmovups %xmm0, (%rdi)
; BTVER2-NEXT: retq
  ; 16-byte copy; the trailing i1 flag is passed as false.
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 16, i1 false)
  ret void
}
688 changes: 688 additions & 0 deletions llvm/test/CodeGen/X86/sse-schedule.ll

Large diffs are not rendered by default.

1,667 changes: 1,667 additions & 0 deletions llvm/test/CodeGen/X86/sse2-schedule.ll

Large diffs are not rendered by default.

156 changes: 155 additions & 1 deletion llvm/test/CodeGen/X86/sse3-schedule.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse3 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=+sse3 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=+sse3 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-ssse3 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,ZNVER1
Expand Down Expand Up @@ -98,6 +100,18 @@ define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; SKX-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_addsubpd:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: addsubpd %xmm1, %xmm0 # sched: [3:1.00]
; BDVER2-SSE-NEXT: addsubpd (%rdi), %xmm0 # sched: [9:1.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_addsubpd:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BDVER2-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_addsubpd:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: addsubpd %xmm1, %xmm0 # sched: [3:1.00]
Expand Down Expand Up @@ -207,6 +221,18 @@ define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; SKX-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_addsubps:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: addsubps %xmm1, %xmm0 # sched: [3:1.00]
; BDVER2-SSE-NEXT: addsubps (%rdi), %xmm0 # sched: [9:1.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_addsubps:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BDVER2-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_addsubps:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: addsubps %xmm1, %xmm0 # sched: [3:1.00]
Expand Down Expand Up @@ -316,6 +342,18 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double
; SKX-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_haddpd:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: haddpd %xmm1, %xmm0 # sched: [5:2.00]
; BDVER2-SSE-NEXT: haddpd (%rdi), %xmm0 # sched: [11:2.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_haddpd:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
; BDVER2-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_haddpd:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: haddpd %xmm1, %xmm0 # sched: [3:1.00]
Expand Down Expand Up @@ -425,6 +463,18 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
; SKX-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_haddps:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: haddps %xmm1, %xmm0 # sched: [5:2.00]
; BDVER2-SSE-NEXT: haddps (%rdi), %xmm0 # sched: [11:2.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_haddps:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
; BDVER2-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_haddps:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: haddps %xmm1, %xmm0 # sched: [3:1.00]
Expand Down Expand Up @@ -534,6 +584,18 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double
; SKX-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_hsubpd:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: hsubpd %xmm1, %xmm0 # sched: [5:2.00]
; BDVER2-SSE-NEXT: hsubpd (%rdi), %xmm0 # sched: [11:2.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_hsubpd:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
; BDVER2-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_hsubpd:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: hsubpd %xmm1, %xmm0 # sched: [3:1.00]
Expand Down Expand Up @@ -643,6 +705,18 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
; SKX-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [12:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_hsubps:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: hsubps %xmm1, %xmm0 # sched: [5:2.00]
; BDVER2-SSE-NEXT: hsubps (%rdi), %xmm0 # sched: [11:2.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_hsubps:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [5:2.00]
; BDVER2-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [11:2.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_hsubps:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: hsubps %xmm1, %xmm0 # sched: [3:1.00]
Expand Down Expand Up @@ -741,6 +815,16 @@ define <16 x i8> @test_lddqu(i8* %a0) {
; SKX-NEXT: vlddqu (%rdi), %xmm0 # sched: [6:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_lddqu:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: lddqu (%rdi), %xmm0 # sched: [6:0.50]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_lddqu:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vlddqu (%rdi), %xmm0 # sched: [6:0.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_lddqu:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: lddqu (%rdi), %xmm0 # sched: [5:1.00]
Expand Down Expand Up @@ -857,6 +941,20 @@ define void @test_monitor(i8* %a0, i32 %a1, i32 %a2) {
; SKX-NEXT: monitor # sched: [100:0.25]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_monitor:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: movl %esi, %ecx # sched: [1:0.33]
; BDVER2-SSE-NEXT: leaq (%rdi), %rax # sched: [1:0.50]
; BDVER2-SSE-NEXT: monitor # sched: [100:0.33]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_monitor:
; BDVER2: # %bb.0:
; BDVER2-NEXT: movl %esi, %ecx # sched: [1:0.33]
; BDVER2-NEXT: leaq (%rdi), %rax # sched: [1:0.50]
; BDVER2-NEXT: monitor # sched: [100:0.33]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_monitor:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: movl %esi, %ecx # sched: [1:0.50]
Expand Down Expand Up @@ -982,6 +1080,20 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) {
; SKX-NEXT: vsubpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_movddup:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] sched: [1:1.00]
; BDVER2-SSE-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] sched: [6:0.50]
; BDVER2-SSE-NEXT: subpd %xmm1, %xmm0 # sched: [3:1.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_movddup:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
; BDVER2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [6:0.50]
; BDVER2-NEXT: vsubpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_movddup:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: movddup {{.*#+}} xmm1 = xmm0[0,0] sched: [1:0.50]
Expand Down Expand Up @@ -1109,6 +1221,20 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) {
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_movshdup:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] sched: [1:1.00]
; BDVER2-SSE-NEXT: movshdup {{.*#+}} xmm0 = mem[1,1,3,3] sched: [6:0.50]
; BDVER2-SSE-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_movshdup:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
; BDVER2-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [6:0.50]
; BDVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_movshdup:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] sched: [1:0.50]
Expand Down Expand Up @@ -1236,6 +1362,20 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) {
; SKX-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [4:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_movsldup:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] sched: [1:1.00]
; BDVER2-SSE-NEXT: movsldup {{.*#+}} xmm0 = mem[0,0,2,2] sched: [6:0.50]
; BDVER2-SSE-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_movsldup:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
; BDVER2-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [6:0.50]
; BDVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_movsldup:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2] sched: [1:0.50]
Expand Down Expand Up @@ -1362,6 +1502,20 @@ define void @test_mwait(i32 %a0, i32 %a1) {
; SKX-NEXT: mwait # sched: [20:2.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_mwait:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: movl %esi, %eax # sched: [1:0.33]
; BDVER2-SSE-NEXT: movl %edi, %ecx # sched: [1:0.33]
; BDVER2-SSE-NEXT: mwait # sched: [100:0.33]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_mwait:
; BDVER2: # %bb.0:
; BDVER2-NEXT: movl %esi, %eax # sched: [1:0.33]
; BDVER2-NEXT: movl %edi, %ecx # sched: [1:0.33]
; BDVER2-NEXT: mwait # sched: [100:0.33]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_mwait:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: movl %esi, %eax # sched: [1:0.50]
Expand Down
651 changes: 651 additions & 0 deletions llvm/test/CodeGen/X86/sse41-schedule.ll

Large diffs are not rendered by default.

172 changes: 172 additions & 0 deletions llvm/test/CodeGen/X86/sse42-schedule.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx | FileCheck %s --check-prefixes=CHECK,SKX-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4.2,+pclmul -mattr=-avx | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx,+xop -mattr=+sse4.2,+pclmul -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-avx | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
Expand Down Expand Up @@ -103,6 +105,20 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
; SKX-NEXT: crc32b (%rdx), %eax # sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: crc32_32_8:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: movl %edi, %eax # sched: [1:0.33]
; BDVER2-SSE-NEXT: crc32b %sil, %eax # sched: [3:1.00]
; BDVER2-SSE-NEXT: crc32b (%rdx), %eax # sched: [8:1.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: crc32_32_8:
; BDVER2: # %bb.0:
; BDVER2-NEXT: movl %edi, %eax # sched: [1:0.33]
; BDVER2-NEXT: crc32b %sil, %eax # sched: [3:1.00]
; BDVER2-NEXT: crc32b (%rdx), %eax # sched: [8:1.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: crc32_32_8:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: movl %edi, %eax # sched: [1:0.50]
Expand Down Expand Up @@ -222,6 +238,20 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) {
; SKX-NEXT: crc32w (%rdx), %eax # sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: crc32_32_16:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: movl %edi, %eax # sched: [1:0.33]
; BDVER2-SSE-NEXT: crc32w %si, %eax # sched: [3:1.00]
; BDVER2-SSE-NEXT: crc32w (%rdx), %eax # sched: [8:1.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: crc32_32_16:
; BDVER2: # %bb.0:
; BDVER2-NEXT: movl %edi, %eax # sched: [1:0.33]
; BDVER2-NEXT: crc32w %si, %eax # sched: [3:1.00]
; BDVER2-NEXT: crc32w (%rdx), %eax # sched: [8:1.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: crc32_32_16:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: movl %edi, %eax # sched: [1:0.50]
Expand Down Expand Up @@ -341,6 +371,20 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) {
; SKX-NEXT: crc32l (%rdx), %eax # sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: crc32_32_32:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: movl %edi, %eax # sched: [1:0.33]
; BDVER2-SSE-NEXT: crc32l %esi, %eax # sched: [3:1.00]
; BDVER2-SSE-NEXT: crc32l (%rdx), %eax # sched: [8:1.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: crc32_32_32:
; BDVER2: # %bb.0:
; BDVER2-NEXT: movl %edi, %eax # sched: [1:0.33]
; BDVER2-NEXT: crc32l %esi, %eax # sched: [3:1.00]
; BDVER2-NEXT: crc32l (%rdx), %eax # sched: [8:1.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: crc32_32_32:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: movl %edi, %eax # sched: [1:0.50]
Expand Down Expand Up @@ -460,6 +504,20 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind {
; SKX-NEXT: crc32b (%rdx), %eax # sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: crc32_64_8:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: movq %rdi, %rax # sched: [1:0.33]
; BDVER2-SSE-NEXT: crc32b %sil, %eax # sched: [3:1.00]
; BDVER2-SSE-NEXT: crc32b (%rdx), %eax # sched: [8:1.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: crc32_64_8:
; BDVER2: # %bb.0:
; BDVER2-NEXT: movq %rdi, %rax # sched: [1:0.33]
; BDVER2-NEXT: crc32b %sil, %eax # sched: [3:1.00]
; BDVER2-NEXT: crc32b (%rdx), %eax # sched: [8:1.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: crc32_64_8:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: movq %rdi, %rax # sched: [1:0.50]
Expand Down Expand Up @@ -579,6 +637,20 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) {
; SKX-NEXT: crc32q (%rdx), %rax # sched: [8:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: crc32_64_64:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: movq %rdi, %rax # sched: [1:0.33]
; BDVER2-SSE-NEXT: crc32q %rsi, %rax # sched: [3:1.00]
; BDVER2-SSE-NEXT: crc32q (%rdx), %rax # sched: [8:1.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: crc32_64_64:
; BDVER2: # %bb.0:
; BDVER2-NEXT: movq %rdi, %rax # sched: [1:0.33]
; BDVER2-NEXT: crc32q %rsi, %rax # sched: [3:1.00]
; BDVER2-NEXT: crc32q (%rdx), %rax # sched: [8:1.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: crc32_64_64:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: movq %rdi, %rax # sched: [1:0.50]
Expand Down Expand Up @@ -770,6 +842,32 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SKX-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_pcmpestri:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: movl $7, %eax # sched: [1:0.33]
; BDVER2-SSE-NEXT: movl $7, %edx # sched: [1:0.33]
; BDVER2-SSE-NEXT: pcmpestri $7, %xmm1, %xmm0 # sched: [4:2.67]
; BDVER2-SSE-NEXT: movl %ecx, %esi # sched: [1:0.33]
; BDVER2-SSE-NEXT: movl $7, %eax # sched: [1:0.33]
; BDVER2-SSE-NEXT: movl $7, %edx # sched: [1:0.33]
; BDVER2-SSE-NEXT: pcmpestri $7, (%rdi), %xmm0 # sched: [4:2.33]
; BDVER2-SSE-NEXT: # kill: def $ecx killed $ecx def $rcx
; BDVER2-SSE-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_pcmpestri:
; BDVER2: # %bb.0:
; BDVER2-NEXT: movl $7, %eax # sched: [1:0.33]
; BDVER2-NEXT: movl $7, %edx # sched: [1:0.33]
; BDVER2-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [4:2.67]
; BDVER2-NEXT: movl %ecx, %esi # sched: [1:0.33]
; BDVER2-NEXT: movl $7, %eax # sched: [1:0.33]
; BDVER2-NEXT: movl $7, %edx # sched: [1:0.33]
; BDVER2-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [4:2.33]
; BDVER2-NEXT: # kill: def $ecx killed $ecx def $rcx
; BDVER2-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_pcmpestri:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: movl $7, %eax # sched: [1:0.50]
Expand Down Expand Up @@ -950,6 +1048,26 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SKX-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [25:4.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_pcmpestrm:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: movl $7, %eax # sched: [1:0.33]
; BDVER2-SSE-NEXT: movl $7, %edx # sched: [1:0.33]
; BDVER2-SSE-NEXT: pcmpestrm $7, %xmm1, %xmm0 # sched: [11:2.67]
; BDVER2-SSE-NEXT: movl $7, %eax # sched: [1:0.33]
; BDVER2-SSE-NEXT: movl $7, %edx # sched: [1:0.33]
; BDVER2-SSE-NEXT: pcmpestrm $7, (%rdi), %xmm0 # sched: [11:2.33]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_pcmpestrm:
; BDVER2: # %bb.0:
; BDVER2-NEXT: movl $7, %eax # sched: [1:0.33]
; BDVER2-NEXT: movl $7, %edx # sched: [1:0.33]
; BDVER2-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [11:2.67]
; BDVER2-NEXT: movl $7, %eax # sched: [1:0.33]
; BDVER2-NEXT: movl $7, %edx # sched: [1:0.33]
; BDVER2-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [11:2.33]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_pcmpestrm:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: movl $7, %eax # sched: [1:0.50]
Expand Down Expand Up @@ -1105,6 +1223,24 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SKX-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_pcmpistri:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: pcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
; BDVER2-SSE-NEXT: movl %ecx, %eax # sched: [1:0.33]
; BDVER2-SSE-NEXT: pcmpistri $7, (%rdi), %xmm0 # sched: [17:3.00]
; BDVER2-SSE-NEXT: # kill: def $ecx killed $ecx def $rcx
; BDVER2-SSE-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_pcmpistri:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [11:3.00]
; BDVER2-NEXT: movl %ecx, %eax # sched: [1:0.33]
; BDVER2-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [17:3.00]
; BDVER2-NEXT: # kill: def $ecx killed $ecx def $rcx
; BDVER2-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_pcmpistri:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: pcmpistri $7, %xmm1, %xmm0 # sched: [7:2.00]
Expand Down Expand Up @@ -1221,6 +1357,18 @@ define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SKX-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [16:3.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_pcmpistrm:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: pcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
; BDVER2-SSE-NEXT: pcmpistrm $7, (%rdi), %xmm0 # sched: [17:3.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_pcmpistrm:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [11:3.00]
; BDVER2-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [17:3.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_pcmpistrm:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: pcmpistrm $7, %xmm1, %xmm0 # sched: [8:2.00]
Expand Down Expand Up @@ -1324,6 +1472,18 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SKX-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_pcmpgtq:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: pcmpgtq %xmm1, %xmm0 # sched: [5:1.00]
; BDVER2-SSE-NEXT: pcmpgtq (%rdi), %xmm0 # sched: [11:1.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_pcmpgtq:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vpcomgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BDVER2-NEXT: vpcomgtq (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_pcmpgtq:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: pcmpgtq %xmm1, %xmm0 # sched: [1:0.50]
Expand Down Expand Up @@ -1428,6 +1588,18 @@ define <2 x i64> @test_pclmulqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; SKX-NEXT: vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_pclmulqdq:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: pclmulqdq $0, %xmm1, %xmm0 # sched: [14:6.00]
; BDVER2-SSE-NEXT: pclmulqdq $0, (%rdi), %xmm0 # sched: [14:5.67]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_pclmulqdq:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0 # sched: [14:6.00]
; BDVER2-NEXT: vpclmulqdq $0, (%rdi), %xmm0, %xmm0 # sched: [14:5.67]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_pclmulqdq:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: pclmulqdq $0, %xmm1, %xmm0 # sched: [2:1.00]
Expand Down
31 changes: 31 additions & 0 deletions llvm/test/CodeGen/X86/sse4a-schedule.ll
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+sse4a | FileCheck %s --check-prefix=BDVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1

Expand All @@ -9,6 +10,11 @@ define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) {
; GENERIC-NEXT: extrq %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_extrq:
; BDVER2: # %bb.0:
; BDVER2-NEXT: extrq %xmm1, %xmm0 # sched: [1:0.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-LABEL: test_extrq:
; BTVER2: # %bb.0:
; BTVER2-NEXT: extrq %xmm1, %xmm0 # sched: [1:0.50]
Expand All @@ -29,6 +35,11 @@ define <2 x i64> @test_extrqi(<2 x i64> %a0) {
; GENERIC-NEXT: extrq $2, $3, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_extrqi:
; BDVER2: # %bb.0:
; BDVER2-NEXT: extrq $2, $3, %xmm0 # sched: [1:0.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-LABEL: test_extrqi:
; BTVER2: # %bb.0:
; BTVER2-NEXT: extrq $2, $3, %xmm0 # sched: [1:0.50]
Expand All @@ -49,6 +60,11 @@ define <2 x i64> @test_insertq(<2 x i64> %a0, <2 x i64> %a1) {
; GENERIC-NEXT: insertq %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_insertq:
; BDVER2: # %bb.0:
; BDVER2-NEXT: insertq %xmm1, %xmm0 # sched: [1:0.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-LABEL: test_insertq:
; BTVER2: # %bb.0:
; BTVER2-NEXT: insertq %xmm1, %xmm0 # sched: [2:2.00]
Expand All @@ -69,6 +85,11 @@ define <2 x i64> @test_insertqi(<2 x i64> %a0, <2 x i64> %a1) {
; GENERIC-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_insertqi:
; BDVER2: # %bb.0:
; BDVER2-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [1:0.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-LABEL: test_insertqi:
; BTVER2: # %bb.0:
; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [2:2.00]
Expand All @@ -89,6 +110,11 @@ define void @test_movntsd(i8* %p, <2 x double> %a) {
; GENERIC-NEXT: movntsd %xmm0, (%rdi) # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_movntsd:
; BDVER2: # %bb.0:
; BDVER2-NEXT: movntsd %xmm0, (%rdi) # sched: [1:1.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-LABEL: test_movntsd:
; BTVER2: # %bb.0:
; BTVER2-NEXT: movntsd %xmm0, (%rdi) # sched: [3:1.00]
Expand All @@ -109,6 +135,11 @@ define void @test_movntss(i8* %p, <4 x float> %a) {
; GENERIC-NEXT: movntss %xmm0, (%rdi) # sched: [1:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_movntss:
; BDVER2: # %bb.0:
; BDVER2-NEXT: movntss %xmm0, (%rdi) # sched: [1:1.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-LABEL: test_movntss:
; BTVER2: # %bb.0:
; BTVER2-NEXT: movntss %xmm0, (%rdi) # sched: [3:1.00]
Expand Down
201 changes: 201 additions & 0 deletions llvm/test/CodeGen/X86/ssse3-schedule.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKYLAKE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,SKX-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,SKX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+ssse3 -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,BDVER2-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx -mattr=+ssse3 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BDVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,BTVER2-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 -mattr=-avx2 | FileCheck %s --check-prefixes=CHECK,BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 -mattr=-sse4.1 | FileCheck %s --check-prefixes=CHECK,ZNVER1-SSE
Expand Down Expand Up @@ -113,6 +115,20 @@ define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
; SKX-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_pabsb:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: pabsb %xmm0, %xmm1 # sched: [1:0.50]
; BDVER2-SSE-NEXT: pabsb (%rdi), %xmm0 # sched: [7:0.50]
; BDVER2-SSE-NEXT: por %xmm1, %xmm0 # sched: [1:0.33]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_pabsb:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
; BDVER2-NEXT: vpabsb (%rdi), %xmm1 # sched: [7:0.50]
; BDVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_pabsb:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: pabsb %xmm0, %xmm1 # sched: [1:0.50]
Expand Down Expand Up @@ -242,6 +258,20 @@ define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) {
; SKX-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_pabsd:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: pabsd %xmm0, %xmm1 # sched: [1:0.50]
; BDVER2-SSE-NEXT: pabsd (%rdi), %xmm0 # sched: [7:0.50]
; BDVER2-SSE-NEXT: por %xmm1, %xmm0 # sched: [1:0.33]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_pabsd:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
; BDVER2-NEXT: vpabsd (%rdi), %xmm1 # sched: [7:0.50]
; BDVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_pabsd:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: pabsd %xmm0, %xmm1 # sched: [1:0.50]
Expand Down Expand Up @@ -371,6 +401,20 @@ define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) {
; SKX-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_pabsw:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: pabsw %xmm0, %xmm1 # sched: [1:0.50]
; BDVER2-SSE-NEXT: pabsw (%rdi), %xmm0 # sched: [7:0.50]
; BDVER2-SSE-NEXT: por %xmm1, %xmm0 # sched: [1:0.33]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_pabsw:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
; BDVER2-NEXT: vpabsw (%rdi), %xmm1 # sched: [7:0.50]
; BDVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_pabsw:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: pabsw %xmm0, %xmm1 # sched: [1:0.50]
Expand Down Expand Up @@ -495,6 +539,19 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SKX-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_palignr:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
; BDVER2-SSE-NEXT: palignr {{.*#+}} xmm1 = mem[14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:0.50]
; BDVER2-SSE-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.33]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_palignr:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
; BDVER2-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [7:0.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_palignr:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: palignr {{.*#+}} xmm1 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
Expand Down Expand Up @@ -605,6 +662,18 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SKX-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_phaddd:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: phaddd %xmm1, %xmm0 # sched: [3:1.50]
; BDVER2-SSE-NEXT: phaddd (%rdi), %xmm0 # sched: [9:1.50]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_phaddd:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
; BDVER2-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_phaddd:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: phaddd %xmm1, %xmm0 # sched: [1:0.50]
Expand Down Expand Up @@ -714,6 +783,18 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SKX-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_phaddsw:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: phaddsw %xmm1, %xmm0 # sched: [3:1.50]
; BDVER2-SSE-NEXT: phaddsw (%rdi), %xmm0 # sched: [9:1.50]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_phaddsw:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
; BDVER2-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_phaddsw:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: phaddsw %xmm1, %xmm0 # sched: [1:0.50]
Expand Down Expand Up @@ -823,6 +904,18 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SKX-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_phaddw:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: phaddw %xmm1, %xmm0 # sched: [3:1.50]
; BDVER2-SSE-NEXT: phaddw (%rdi), %xmm0 # sched: [9:1.50]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_phaddw:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
; BDVER2-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_phaddw:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: phaddw %xmm1, %xmm0 # sched: [1:0.50]
Expand Down Expand Up @@ -932,6 +1025,18 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SKX-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_phsubd:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: phsubd %xmm1, %xmm0 # sched: [3:1.50]
; BDVER2-SSE-NEXT: phsubd (%rdi), %xmm0 # sched: [9:1.50]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_phsubd:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
; BDVER2-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_phsubd:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: phsubd %xmm1, %xmm0 # sched: [1:0.50]
Expand Down Expand Up @@ -1041,6 +1146,18 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SKX-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_phsubsw:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: phsubsw %xmm1, %xmm0 # sched: [3:1.50]
; BDVER2-SSE-NEXT: phsubsw (%rdi), %xmm0 # sched: [9:1.50]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_phsubsw:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
; BDVER2-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_phsubsw:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: phsubsw %xmm1, %xmm0 # sched: [1:0.50]
Expand Down Expand Up @@ -1150,6 +1267,18 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SKX-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:2.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_phsubw:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: phsubw %xmm1, %xmm0 # sched: [3:1.50]
; BDVER2-SSE-NEXT: phsubw (%rdi), %xmm0 # sched: [9:1.50]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_phsubw:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [3:1.50]
; BDVER2-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [9:1.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_phsubw:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: phsubw %xmm1, %xmm0 # sched: [1:0.50]
Expand Down Expand Up @@ -1259,6 +1388,18 @@ define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SKX-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_pmaddubsw:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: pmaddubsw %xmm1, %xmm0 # sched: [5:1.00]
; BDVER2-SSE-NEXT: pmaddubsw (%rdi), %xmm0 # sched: [11:1.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_pmaddubsw:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; BDVER2-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_pmaddubsw:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: pmaddubsw %xmm1, %xmm0 # sched: [2:1.00]
Expand Down Expand Up @@ -1369,6 +1510,18 @@ define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SKX-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [10:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_pmulhrsw:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: pmulhrsw %xmm1, %xmm0 # sched: [5:1.00]
; BDVER2-SSE-NEXT: pmulhrsw (%rdi), %xmm0 # sched: [11:1.00]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_pmulhrsw:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; BDVER2-NEXT: vpmulhrsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_pmulhrsw:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: pmulhrsw %xmm1, %xmm0 # sched: [2:1.00]
Expand Down Expand Up @@ -1478,6 +1631,18 @@ define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SKX-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_pshufb:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: pshufb %xmm1, %xmm0 # sched: [1:0.50]
; BDVER2-SSE-NEXT: pshufb (%rdi), %xmm0 # sched: [7:0.50]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_pshufb:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BDVER2-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_pshufb:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: pshufb %xmm1, %xmm0 # sched: [2:2.00]
Expand Down Expand Up @@ -1591,6 +1756,18 @@ define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; SKX-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_psignb:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: psignb %xmm1, %xmm0 # sched: [1:0.50]
; BDVER2-SSE-NEXT: psignb (%rdi), %xmm0 # sched: [7:0.50]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_psignb:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BDVER2-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_psignb:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: psignb %xmm1, %xmm0 # sched: [1:0.50]
Expand Down Expand Up @@ -1704,6 +1881,18 @@ define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; SKX-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_psignd:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: psignd %xmm1, %xmm0 # sched: [1:0.50]
; BDVER2-SSE-NEXT: psignd (%rdi), %xmm0 # sched: [7:0.50]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_psignd:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BDVER2-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_psignd:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: psignd %xmm1, %xmm0 # sched: [1:0.50]
Expand Down Expand Up @@ -1817,6 +2006,18 @@ define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; SKX-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; SKX-NEXT: retq # sched: [7:1.00]
;
; BDVER2-SSE-LABEL: test_psignw:
; BDVER2-SSE: # %bb.0:
; BDVER2-SSE-NEXT: psignw %xmm1, %xmm0 # sched: [1:0.50]
; BDVER2-SSE-NEXT: psignw (%rdi), %xmm0 # sched: [7:0.50]
; BDVER2-SSE-NEXT: retq # sched: [1:1.00]
;
; BDVER2-LABEL: test_psignw:
; BDVER2: # %bb.0:
; BDVER2-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BDVER2-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [7:0.50]
; BDVER2-NEXT: retq # sched: [1:1.00]
;
; BTVER2-SSE-LABEL: test_psignw:
; BTVER2-SSE: # %bb.0:
; BTVER2-SSE-NEXT: psignw %xmm1, %xmm0 # sched: [1:0.50]
Expand Down
526 changes: 405 additions & 121 deletions llvm/test/CodeGen/X86/tbm-schedule.ll

Large diffs are not rendered by default.

562 changes: 562 additions & 0 deletions llvm/test/CodeGen/X86/x87-schedule.ll

Large diffs are not rendered by default.

1,558 changes: 1,189 additions & 369 deletions llvm/test/CodeGen/X86/xop-schedule.ll

Large diffs are not rendered by default.

95 changes: 95 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BdVer2/add-sequence.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1000 -timeline < %s | FileCheck %s

# Three 32-bit register adds (AT&T operand order: source, destination).
# The second add reads and writes %eax; the third add reads that %eax, and so
# do the first and second adds of the next iteration.
add %eax, %ecx
add %esi, %eax
add %eax, %edx

# CHECK: Iterations: 1000
# CHECK-NEXT: Instructions: 3000
# CHECK-NEXT: Total Cycles: 1004
# CHECK-NEXT: Total uOps: 3000

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 2.99
# CHECK-NEXT: IPC: 2.99
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 1 0.33 addl %eax, %ecx
# CHECK-NEXT: 1 1 0.33 addl %esi, %eax
# CHECK-NEXT: 1 1 0.33 addl %eax, %edx

# CHECK: Resources:
# CHECK-NEXT: [0] - SBDivider
# CHECK-NEXT: [1] - SBFPDivider
# CHECK-NEXT: [2] - SBPort0
# CHECK-NEXT: [3] - SBPort1
# CHECK-NEXT: [4] - SBPort4
# CHECK-NEXT: [5] - SBPort5
# CHECK-NEXT: [6.0] - SBPort23
# CHECK-NEXT: [6.1] - SBPort23

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
# CHECK-NEXT: - - 1.00 1.00 - 1.00 - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
# CHECK-NEXT: - - - 1.00 - - - - addl %eax, %ecx
# CHECK-NEXT: - - - - - 1.00 - - addl %esi, %eax
# CHECK-NEXT: - - 1.00 - - - - - addl %eax, %edx

# CHECK: Timeline view:
# CHECK-NEXT: 0123
# CHECK-NEXT: Index 0123456789

# CHECK: [0,0] DeER . . . addl %eax, %ecx
# CHECK-NEXT: [0,1] DeER . . . addl %esi, %eax
# CHECK-NEXT: [0,2] D=eER. . . addl %eax, %edx
# CHECK-NEXT: [1,0] D=eER. . . addl %eax, %ecx
# CHECK-NEXT: [1,1] .DeER. . . addl %esi, %eax
# CHECK-NEXT: [1,2] .D=eER . . addl %eax, %edx
# CHECK-NEXT: [2,0] .D=eER . . addl %eax, %ecx
# CHECK-NEXT: [2,1] .D=eER . . addl %esi, %eax
# CHECK-NEXT: [2,2] . D=eER . . addl %eax, %edx
# CHECK-NEXT: [3,0] . D=eER . . addl %eax, %ecx
# CHECK-NEXT: [3,1] . D=eER . . addl %esi, %eax
# CHECK-NEXT: [3,2] . D==eER . . addl %eax, %edx
# CHECK-NEXT: [4,0] . D=eER . . addl %eax, %ecx
# CHECK-NEXT: [4,1] . D=eER . . addl %esi, %eax
# CHECK-NEXT: [4,2] . D==eER . . addl %eax, %edx
# CHECK-NEXT: [5,0] . D==eER . . addl %eax, %ecx
# CHECK-NEXT: [5,1] . D=eER . . addl %esi, %eax
# CHECK-NEXT: [5,2] . D==eER. . addl %eax, %edx
# CHECK-NEXT: [6,0] . D==eER. . addl %eax, %ecx
# CHECK-NEXT: [6,1] . D==eER. . addl %esi, %eax
# CHECK-NEXT: [6,2] . D==eER . addl %eax, %edx
# CHECK-NEXT: [7,0] . D==eER . addl %eax, %ecx
# CHECK-NEXT: [7,1] . D==eER . addl %esi, %eax
# CHECK-NEXT: [7,2] . D===eER . addl %eax, %edx
# CHECK-NEXT: [8,0] . .D==eER . addl %eax, %ecx
# CHECK-NEXT: [8,1] . .D==eER . addl %esi, %eax
# CHECK-NEXT: [8,2] . .D===eER. addl %eax, %edx
# CHECK-NEXT: [9,0] . .D===eER. addl %eax, %ecx
# CHECK-NEXT: [9,1] . . D==eER. addl %esi, %eax
# CHECK-NEXT: [9,2] . . D===eER addl %eax, %edx

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 10 2.5 0.1 0.0 addl %eax, %ecx
# CHECK-NEXT: 1. 10 2.2 0.1 0.0 addl %esi, %eax
# CHECK-NEXT: 2. 10 3.0 0.0 0.0 addl %eax, %edx
63 changes: 63 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BdVer2/clear-super-register-1.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=100 -resource-pressure=false -timeline -timeline-max-iterations=2 < %s | FileCheck %s

## Sets register RAX.
imulq $5, %rcx, %rax

## Writes EAX, which kills the previous definition of RAX: the upper portion
## of RAX is cleared by the 32-bit register write.
lzcnt %ecx, %eax

## The AND reads the RAX defined by the LZCNT, so it can start immediately
## after the LZCNT. It doesn't need to wait for the IMUL.
and %rcx, %rax
## Reads the RAX produced by the AND and writes RCX, which is a source operand
## of the IMUL, LZCNT and AND above on the next iteration.
bsf %rax, %rcx

# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 400
# CHECK-NEXT: Total Cycles: 803
# CHECK-NEXT: Total uOps: 400

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 0.50
# CHECK-NEXT: IPC: 0.50
# CHECK-NEXT: Block RThroughput: 3.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 3 1.00 imulq $5, %rcx, %rax
# CHECK-NEXT: 1 3 1.00 lzcntl %ecx, %eax
# CHECK-NEXT: 1 1 0.33 andq %rcx, %rax
# CHECK-NEXT: 1 3 1.00 bsfq %rax, %rcx

# CHECK: Timeline view:
# CHECK-NEXT: 012345678
# CHECK-NEXT: Index 0123456789

# CHECK: [0,0] DeeeER . . . imulq $5, %rcx, %rax
# CHECK-NEXT: [0,1] D=eeeER . . . lzcntl %ecx, %eax
# CHECK-NEXT: [0,2] D====eER . . . andq %rcx, %rax
# CHECK-NEXT: [0,3] D=====eeeER . . bsfq %rax, %rcx
# CHECK-NEXT: [1,0] .D=======eeeER . . imulq $5, %rcx, %rax
# CHECK-NEXT: [1,1] .D========eeeER. . lzcntl %ecx, %eax
# CHECK-NEXT: [1,2] .D===========eER . andq %rcx, %rax
# CHECK-NEXT: [1,3] .D============eeeER bsfq %rax, %rcx

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 4.5 0.5 0.0 imulq $5, %rcx, %rax
# CHECK-NEXT: 1. 2 5.5 1.5 0.0 lzcntl %ecx, %eax
# CHECK-NEXT: 2. 2 8.5 0.0 0.0 andq %rcx, %rax
# CHECK-NEXT: 3. 2 9.5 0.0 0.0 bsfq %rax, %rcx
137 changes: 137 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BdVer2/clear-super-register-2.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=100 -resource-pressure=false -timeline -timeline-max-iterations=2 < %s | FileCheck %s

# In this test, the VDIVPS has a latency of 29 cycles (see the instruction info
# view below) before its result is written to register YMM3. The first VADDPS
# does not depend on the VDIVPS (the WAW dependency is eliminated at the
# register renaming stage), so it can be executed in parallel with the VDIVPS.
# That VADDPS writes to register XMM3, and the upper half of YMM3 is implicitly
# cleared. As a consequence, the definition of YMM3 from the VDIVPS is killed,
# and the subsequent VADDPS instructions don't need to wait for the VDIVPS to
# complete.
# The block reciprocal throughput is limited by the VDIVPS reciprocal throughput
# (which is 28 cycles). The remaining VADDPS instructions execute while the
# VDIVPS is still in flight; their latency is "hidden" by the long latency of
# the VDIVPS.

# Long-latency 256-bit divide that defines YMM3.
vdivps %ymm0, %ymm1, %ymm3
# 128-bit add that redefines XMM3 (and therefore the whole of YMM3).
vaddps %xmm0, %xmm1, %xmm3
# 15 identical 256-bit adds; each one reads YMM3 and writes YMM4.
vaddps %ymm3, %ymm1, %ymm4
vaddps %ymm3, %ymm1, %ymm4
vaddps %ymm3, %ymm1, %ymm4
vaddps %ymm3, %ymm1, %ymm4
vaddps %ymm3, %ymm1, %ymm4
vaddps %ymm3, %ymm1, %ymm4
vaddps %ymm3, %ymm1, %ymm4
vaddps %ymm3, %ymm1, %ymm4
vaddps %ymm3, %ymm1, %ymm4
vaddps %ymm3, %ymm1, %ymm4
vaddps %ymm3, %ymm1, %ymm4
vaddps %ymm3, %ymm1, %ymm4
vaddps %ymm3, %ymm1, %ymm4
vaddps %ymm3, %ymm1, %ymm4
vaddps %ymm3, %ymm1, %ymm4
# Reads XMM4 (the low half of the YMM4 written above) and writes XMM0, which is
# a source operand of the VDIVPS and of the first VADDPS on the next iteration.
vandps %xmm4, %xmm1, %xmm0

# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 1800
# CHECK-NEXT: Total Cycles: 2804
# CHECK-NEXT: Total uOps: 2000

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 0.71
# CHECK-NEXT: IPC: 0.64
# CHECK-NEXT: Block RThroughput: 28.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 3 29 28.00 vdivps %ymm0, %ymm1, %ymm3
# CHECK-NEXT: 1 3 1.00 vaddps %xmm0, %xmm1, %xmm3
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 3 1.00 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 1 1 1.00 vandps %xmm4, %xmm1, %xmm0

# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 0123456789

# CHECK: [0,0] DeeeeeeeeeeeeeeeeeeeeeeeeeeeeeER . . . . . . vdivps %ymm0, %ymm1, %ymm3
# CHECK-NEXT: [0,1] DeeeE--------------------------R . . . . . . vaddps %xmm0, %xmm1, %xmm3
# CHECK-NEXT: [0,2] .D==eeeE-----------------------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,3] .D===eeeE----------------------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,4] .D====eeeE---------------------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,5] .D=====eeeE--------------------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,6] . D=====eeeE-------------------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,7] . D======eeeE------------------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,8] . D=======eeeE-----------------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,9] . D========eeeE----------------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,10] . D========eeeE---------------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,11] . D=========eeeE--------------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,12] . D==========eeeE-------------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,13] . D===========eeeE------------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,14] . D===========eeeE-----------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,15] . D============eeeE----------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,16] . D=============eeeE---------R . . . . . . vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [0,17] . D================eE--------R . . . . . . vandps %xmm4, %xmm1, %xmm0
# CHECK-NEXT: [1,0] . D=======================eeeeeeeeeeeeeeeeeeeeeeeeeeeeeER vdivps %ymm0, %ymm1, %ymm3
# CHECK-NEXT: [1,1] . D================eeeE---------------------------------R vaddps %xmm0, %xmm1, %xmm3
# CHECK-NEXT: [1,2] . .D==================eeeE------------------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,3] . .D===================eeeE-----------------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,4] . .D====================eeeE----------------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,5] . .D=====================eeeE---------------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,6] . . D=====================eeeE--------------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,7] . . D======================eeeE-------------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,8] . . D=======================eeeE------------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,9] . . D========================eeeE-----------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,10] . . D========================eeeE----------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,11] . . D=========================eeeE---------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,12] . . D==========================eeeE--------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,13] . . D===========================eeeE-------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,14] . . D===========================eeeE------------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,15] . . D============================eeeE-----------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,16] . . D=============================eeeE----------------R vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: [1,17] . . D================================eE---------------R vandps %xmm4, %xmm1, %xmm0

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 12.5 4.0 0.0 vdivps %ymm0, %ymm1, %ymm3
# CHECK-NEXT: 1. 2 9.0 0.5 29.5 vaddps %xmm0, %xmm1, %xmm3
# CHECK-NEXT: 2. 2 11.0 0.0 26.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 3. 2 12.0 1.0 25.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 4. 2 13.0 2.0 24.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 5. 2 14.0 3.0 23.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 6. 2 14.0 4.0 22.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 7. 2 15.0 5.0 21.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 8. 2 16.0 6.0 20.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 9. 2 17.0 7.0 19.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 10. 2 17.0 8.0 18.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 11. 2 18.0 9.0 17.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 12. 2 19.0 10.0 16.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 13. 2 20.0 11.0 15.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 14. 2 20.0 12.0 14.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 15. 2 21.0 13.0 13.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 16. 2 22.0 14.0 12.5 vaddps %ymm3, %ymm1, %ymm4
# CHECK-NEXT: 17. 2 25.0 0.0 11.5 vandps %xmm4, %xmm1, %xmm0
72 changes: 72 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-cmp.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s

# perf stat reports an IPC of 1.97 for this block of code.
# NOTE(review): the CPU this was measured on is not stated here - confirm. The
# expectations below are produced by the scheduling model selected on the RUN
# line, not by that measurement.

# The CMP instruction compares EAX with itself, so its result doesn't depend on
# the value of EAX. It can set the flags without having to read the inputs.

# The CMOVAE reads the flags set by the CMP and conditionally moves EBX to EAX.
cmp %eax, %eax
cmovae %ebx, %eax

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 3000
# CHECK-NEXT: Total Cycles: 4503
# CHECK-NEXT: Total uOps: 4500

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 1.00
# CHECK-NEXT: IPC: 0.67
# CHECK-NEXT: Block RThroughput: 0.8

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 1 0.33 cmpl %eax, %eax
# CHECK-NEXT: 2 2 0.67 cmovael %ebx, %eax

# CHECK: Resources:
# CHECK-NEXT: [0] - SBDivider
# CHECK-NEXT: [1] - SBFPDivider
# CHECK-NEXT: [2] - SBPort0
# CHECK-NEXT: [3] - SBPort1
# CHECK-NEXT: [4] - SBPort4
# CHECK-NEXT: [5] - SBPort5
# CHECK-NEXT: [6.0] - SBPort23
# CHECK-NEXT: [6.1] - SBPort23

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
# CHECK-NEXT: - - 1.00 1.00 - 1.00 - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
# CHECK-NEXT: - - - - - 1.00 - - cmpl %eax, %eax
# CHECK-NEXT: - - 1.00 1.00 - - - - cmovael %ebx, %eax

# CHECK: Timeline view:
# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789

# CHECK: [0,0] DeER . .. cmpl %eax, %eax
# CHECK-NEXT: [0,1] D=eeER .. cmovael %ebx, %eax
# CHECK-NEXT: [1,0] D===eER .. cmpl %eax, %eax
# CHECK-NEXT: [1,1] .D===eeER .. cmovael %ebx, %eax
# CHECK-NEXT: [2,0] .D=====eER.. cmpl %eax, %eax
# CHECK-NEXT: [2,1] . D=====eeER cmovael %ebx, %eax

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 3.7 0.3 0.0 cmpl %eax, %eax
# CHECK-NEXT: 1. 3 4.0 0.0 0.0 cmovael %ebx, %eax
87 changes: 87 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpeq.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s

# perf stat reports an IPC of 2.00 for this block of code.
# NOTE(review): the CPU this was measured on is not stated here - confirm. The
# expectations below are produced by the scheduling model selected on the RUN
# line, not by that measurement.

# Each of the vector packed compares in this test compares a register with
# itself, which makes all of them dependency-breaking instructions. That means
# there is no RAW dependency between any of the instructions, and the code can
# be fully parallelized in hardware.

# The destination of each compare is the (repeated) source register of the
# next one: XMM0 -> XMM1 -> XMM2 -> XMM3 -> XMM0.
vpcmpeqb %xmm0, %xmm0, %xmm1
vpcmpeqw %xmm1, %xmm1, %xmm2
vpcmpeqd %xmm2, %xmm2, %xmm3
vpcmpeqq %xmm3, %xmm3, %xmm0

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 6000
# CHECK-NEXT: Total Cycles: 6003
# CHECK-NEXT: Total uOps: 6000

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 1.00
# CHECK-NEXT: IPC: 1.00
# CHECK-NEXT: Block RThroughput: 2.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 1 0.50 vpcmpeqb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: 1 1 0.50 vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: 1 1 0.50 vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: 1 1 0.50 vpcmpeqq %xmm3, %xmm3, %xmm0

# CHECK: Resources:
# CHECK-NEXT: [0] - SBDivider
# CHECK-NEXT: [1] - SBFPDivider
# CHECK-NEXT: [2] - SBPort0
# CHECK-NEXT: [3] - SBPort1
# CHECK-NEXT: [4] - SBPort4
# CHECK-NEXT: [5] - SBPort5
# CHECK-NEXT: [6.0] - SBPort23
# CHECK-NEXT: [6.1] - SBPort23

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
# CHECK-NEXT: - - - 2.00 - 2.00 - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
# CHECK-NEXT: - - - - - 1.00 - - vpcmpeqb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: - - - 1.00 - - - - vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: - - - - - 1.00 - - vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: - - - 1.00 - - - - vpcmpeqq %xmm3, %xmm3, %xmm0

# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789

# CHECK: [0,0] DeER . . . vpcmpeqb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [0,1] D=eER. . . vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [0,2] D==eER . . vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [0,3] D===eER . . vpcmpeqq %xmm3, %xmm3, %xmm0
# CHECK-NEXT: [1,0] .D===eER . . vpcmpeqb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [1,1] .D====eER . . vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [1,2] .D=====eER. . vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [1,3] .D======eER . vpcmpeqq %xmm3, %xmm3, %xmm0
# CHECK-NEXT: [2,0] . D======eER . vpcmpeqb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [2,1] . D=======eER . vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [2,2] . D========eER. vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [2,3] . D=========eER vpcmpeqq %xmm3, %xmm3, %xmm0

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 4.0 0.3 0.0 vpcmpeqb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: 1. 3 5.0 0.0 0.0 vpcmpeqw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: 2. 3 6.0 0.0 0.0 vpcmpeqd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: 3. 3 7.0 0.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm0
87 changes: 87 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-pcmpgt.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s

# perf stat reports an IPC of 2.00 for this block of code.
# NOTE(review): the expectations below record an IPC of 4.00 under the
# -mcpu=x86-64 model used by this test, so the perf stat figure presumably
# was measured on different hardware - confirm.

# All of the vector packed compares from this test are zero idioms. These zero
# idioms are all detected and removed by the register renamer. That means, no
# uOp is executed, and there is no RAW dependency for any of the packed
# compares.

# Each compare uses one register for both of its sources. The destinations are
# chained (%xmm1 -> %xmm2 -> %xmm3 -> %xmm0), so the last write lands in the
# register read by the first compare of the next iteration.
vpcmpgtb %xmm0, %xmm0, %xmm1
vpcmpgtw %xmm1, %xmm1, %xmm2
vpcmpgtd %xmm2, %xmm2, %xmm3
vpcmpgtq %xmm3, %xmm3, %xmm0

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 6000
# CHECK-NEXT: Total Cycles: 1501
# CHECK-NEXT: Total uOps: 6000

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 4.00
# CHECK-NEXT: IPC: 4.00
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 0 0.25 vpcmpgtb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: 1 0 0.25 vpcmpgtw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: 1 0 0.25 vpcmpgtd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: 1 0 0.25 vpcmpgtq %xmm3, %xmm3, %xmm0

# CHECK: Resources:
# CHECK-NEXT: [0] - SBDivider
# CHECK-NEXT: [1] - SBFPDivider
# CHECK-NEXT: [2] - SBPort0
# CHECK-NEXT: [3] - SBPort1
# CHECK-NEXT: [4] - SBPort4
# CHECK-NEXT: [5] - SBPort5
# CHECK-NEXT: [6.0] - SBPort23
# CHECK-NEXT: [6.1] - SBPort23

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
# CHECK-NEXT: - - - - - - - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
# CHECK-NEXT: - - - - - - - - vpcmpgtb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: - - - - - - - - vpcmpgtw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - vpcmpgtd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: - - - - - - - - vpcmpgtq %xmm3, %xmm3, %xmm0

# CHECK: Timeline view:
# CHECK-NEXT: Index 0123

# CHECK: [0,0] DR . vpcmpgtb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [0,1] DR . vpcmpgtw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [0,2] DR . vpcmpgtd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [0,3] DR . vpcmpgtq %xmm3, %xmm3, %xmm0
# CHECK-NEXT: [1,0] .DR. vpcmpgtb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [1,1] .DR. vpcmpgtw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [1,2] .DR. vpcmpgtd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [1,3] .DR. vpcmpgtq %xmm3, %xmm3, %xmm0
# CHECK-NEXT: [2,0] . DR vpcmpgtb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [2,1] . DR vpcmpgtw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: [2,2] . DR vpcmpgtd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [2,3] . DR vpcmpgtq %xmm3, %xmm3, %xmm0

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 0.0 0.0 0.0 vpcmpgtb %xmm0, %xmm0, %xmm1
# CHECK-NEXT: 1. 3 0.0 0.0 0.0 vpcmpgtw %xmm1, %xmm1, %xmm2
# CHECK-NEXT: 2. 3 0.0 0.0 0.0 vpcmpgtd %xmm2, %xmm2, %xmm3
# CHECK-NEXT: 3. 3 0.0 0.0 0.0 vpcmpgtq %xmm3, %xmm3, %xmm0
73 changes: 73 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-1.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s

# perf stat reports an IPC of 1.00 for this code block.
# NOTE(review): the expectations below record an IPC of 0.50 (two uOps and a
# latency of 2 per sbbl) under the -mcpu=x86-64 model used by this test, so
# the perf stat figure presumably was measured on different hardware - confirm.

# Although both SBB are dependency breaking instructions, there is still an
# implicit dependency on EFLAGS which limits the ILP. So, the hardware backend
# can only execute one instruction per cycle.

# Both instructions subtract a register from itself with borrow, so the only
# value carried from one SBB to the next is the flags.
sbb %edx, %edx
sbb %eax, %eax

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 3000
# CHECK-NEXT: Total Cycles: 6003
# CHECK-NEXT: Total uOps: 6000

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 1.00
# CHECK-NEXT: IPC: 0.50
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 2 2 0.67 sbbl %edx, %edx
# CHECK-NEXT: 2 2 0.67 sbbl %eax, %eax

# CHECK: Resources:
# CHECK-NEXT: [0] - SBDivider
# CHECK-NEXT: [1] - SBFPDivider
# CHECK-NEXT: [2] - SBPort0
# CHECK-NEXT: [3] - SBPort1
# CHECK-NEXT: [4] - SBPort4
# CHECK-NEXT: [5] - SBPort5
# CHECK-NEXT: [6.0] - SBPort23
# CHECK-NEXT: [6.1] - SBPort23

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
# CHECK-NEXT: - - 1.33 1.33 - 1.33 - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
# CHECK-NEXT: - - 0.67 0.67 - 0.67 - - sbbl %edx, %edx
# CHECK-NEXT: - - 0.67 0.67 - 0.67 - - sbbl %eax, %eax

# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789

# CHECK: [0,0] DeeER. . . sbbl %edx, %edx
# CHECK-NEXT: [0,1] D==eeER . . sbbl %eax, %eax
# CHECK-NEXT: [1,0] .D===eeER . . sbbl %edx, %edx
# CHECK-NEXT: [1,1] .D=====eeER . sbbl %eax, %eax
# CHECK-NEXT: [2,0] . D======eeER . sbbl %edx, %edx
# CHECK-NEXT: [2,1] . D========eeER sbbl %eax, %eax

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 4.0 0.3 0.0 sbbl %edx, %edx
# CHECK-NEXT: 1. 3 6.0 0.0 0.0 sbbl %eax, %eax
80 changes: 80 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BdVer2/dependency-breaking-sbb-2.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -timeline -timeline-max-iterations=3 -iterations=1500 < %s | FileCheck %s

# perf stat reports a throughput of 1.51 IPC for this block of code.
# NOTE(review): the expectations below record an IPC of 0.60 under the
# -mcpu=x86-64 model used by this test, so the perf stat figure presumably
# was measured on different hardware - confirm.

# The SBB does not depend on the value of register EAX. That means, it doesn't
# have to wait for the IMUL to write-back on EAX. However, it still depends on
# the ADD for EFLAGS.

# imul writes %eax, add writes %edx (and the flags), and the final sbb both
# reads and writes %eax, which is what the next iteration's imul consumes.
imul %edx, %eax
add %edx, %edx
sbb %eax, %eax

# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 4500
# CHECK-NEXT: Total Cycles: 7503
# CHECK-NEXT: Total uOps: 6000

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 0.80
# CHECK-NEXT: IPC: 0.60
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 3 1.00 imull %edx, %eax
# CHECK-NEXT: 1 1 0.33 addl %edx, %edx
# CHECK-NEXT: 2 2 0.67 sbbl %eax, %eax

# CHECK: Resources:
# CHECK-NEXT: [0] - SBDivider
# CHECK-NEXT: [1] - SBFPDivider
# CHECK-NEXT: [2] - SBPort0
# CHECK-NEXT: [3] - SBPort1
# CHECK-NEXT: [4] - SBPort4
# CHECK-NEXT: [5] - SBPort5
# CHECK-NEXT: [6.0] - SBPort23
# CHECK-NEXT: [6.1] - SBPort23

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
# CHECK-NEXT: - - 1.33 1.33 - 1.33 - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
# CHECK-NEXT: - - - 1.00 - - - - imull %edx, %eax
# CHECK-NEXT: - - 0.33 0.33 - 0.34 - - addl %edx, %edx
# CHECK-NEXT: - - 1.00 - - 1.00 - - sbbl %eax, %eax

# CHECK: Timeline view:
# CHECK-NEXT: 01234567
# CHECK-NEXT: Index 0123456789

# CHECK: [0,0] DeeeER . . . imull %edx, %eax
# CHECK-NEXT: [0,1] DeE--R . . . addl %edx, %edx
# CHECK-NEXT: [0,2] D===eeER . . . sbbl %eax, %eax
# CHECK-NEXT: [1,0] .D====eeeER . . imull %edx, %eax
# CHECK-NEXT: [1,1] .DeE------R . . addl %edx, %edx
# CHECK-NEXT: [1,2] .D=======eeER . . sbbl %eax, %eax
# CHECK-NEXT: [2,0] . D========eeeER . imull %edx, %eax
# CHECK-NEXT: [2,1] . DeE----------R . addl %edx, %edx
# CHECK-NEXT: [2,2] . D===========eeER sbbl %eax, %eax

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 5.0 0.3 0.0 imull %edx, %eax
# CHECK-NEXT: 1. 3 1.0 0.3 6.0 addl %edx, %edx
# CHECK-NEXT: 2. 3 8.0 0.0 0.0 sbbl %eax, %eax
95 changes: 95 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BdVer2/dependent-pmuld-paddd.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=500 -timeline < %s | FileCheck %s

# Dependent chain: the multiply result in %xmm1 feeds the first add, whose
# result in %xmm0 feeds both the second add and the multiply of the next
# iteration. The multiply must be spelled vpmuldq: "vpmuld" is not an x86
# mnemonic, and the expectations below print the instruction as vpmuldq.
vpmuldq %xmm0, %xmm0, %xmm1
vpaddd %xmm1, %xmm1, %xmm0
vpaddd %xmm0, %xmm0, %xmm3

# CHECK: Iterations: 500
# CHECK-NEXT: Instructions: 1500
# CHECK-NEXT: Total Cycles: 3004
# CHECK-NEXT: Total uOps: 1500

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 0.50
# CHECK-NEXT: IPC: 0.50
# CHECK-NEXT: Block RThroughput: 1.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 5 1.00 vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: 1 1 0.50 vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: 1 1 0.50 vpaddd %xmm0, %xmm0, %xmm3

# CHECK: Resources:
# CHECK-NEXT: [0] - SBDivider
# CHECK-NEXT: [1] - SBFPDivider
# CHECK-NEXT: [2] - SBPort0
# CHECK-NEXT: [3] - SBPort1
# CHECK-NEXT: [4] - SBPort4
# CHECK-NEXT: [5] - SBPort5
# CHECK-NEXT: [6.0] - SBPort23
# CHECK-NEXT: [6.1] - SBPort23

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
# CHECK-NEXT: - - 1.00 1.00 - 1.00 - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
# CHECK-NEXT: - - 1.00 - - - - - vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: - - - - - 1.00 - - vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: - - - 1.00 - - - - vpaddd %xmm0, %xmm0, %xmm3

# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0123456789 0123456789
# CHECK-NEXT: Index 0123456789 0123456789 0123456789 0123

# CHECK: [0,0] DeeeeeER . . . . . . . . . . . . vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [0,1] D=====eER . . . . . . . . . . . . vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: [0,2] D======eER. . . . . . . . . . . . vpaddd %xmm0, %xmm0, %xmm3
# CHECK-NEXT: [1,0] D======eeeeeER . . . . . . . . . . . vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [1,1] .D==========eER. . . . . . . . . . . vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: [1,2] .D===========eER . . . . . . . . . . vpaddd %xmm0, %xmm0, %xmm3
# CHECK-NEXT: [2,0] .D===========eeeeeER. . . . . . . . . . vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [2,1] .D================eER . . . . . . . . . vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: [2,2] . D================eER . . . . . . . . . vpaddd %xmm0, %xmm0, %xmm3
# CHECK-NEXT: [3,0] . D================eeeeeER . . . . . . . . vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [3,1] . D=====================eER . . . . . . . . vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: [3,2] . D======================eER . . . . . . . . vpaddd %xmm0, %xmm0, %xmm3
# CHECK-NEXT: [4,0] . D=====================eeeeeER . . . . . . . vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [4,1] . D==========================eER . . . . . . . vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: [4,2] . D===========================eER . . . . . . . vpaddd %xmm0, %xmm0, %xmm3
# CHECK-NEXT: [5,0] . D===========================eeeeeER . . . . . . vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [5,1] . D===============================eER . . . . . . vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: [5,2] . D================================eER. . . . . . vpaddd %xmm0, %xmm0, %xmm3
# CHECK-NEXT: [6,0] . D================================eeeeeER . . . . . vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [6,1] . D=====================================eER. . . . . vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: [6,2] . D=====================================eER . . . . vpaddd %xmm0, %xmm0, %xmm3
# CHECK-NEXT: [7,0] . D=====================================eeeeeER. . . . vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [7,1] . D==========================================eER . . . vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: [7,2] . D===========================================eER . . . vpaddd %xmm0, %xmm0, %xmm3
# CHECK-NEXT: [8,0] . .D==========================================eeeeeER . . vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [8,1] . .D===============================================eER . . vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: [8,2] . .D================================================eER . . vpaddd %xmm0, %xmm0, %xmm3
# CHECK-NEXT: [9,0] . .D================================================eeeeeER . vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: [9,1] . . D====================================================eER. vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: [9,2] . . D=====================================================eER vpaddd %xmm0, %xmm0, %xmm3

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 10 25.0 0.1 0.0 vpmuldq %xmm0, %xmm0, %xmm1
# CHECK-NEXT: 1. 10 29.7 0.0 0.0 vpaddd %xmm1, %xmm1, %xmm0
# CHECK-NEXT: 2. 10 30.5 0.0 0.0 vpaddd %xmm0, %xmm0, %xmm3
74 changes: 74 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BdVer2/dot-product.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=300 -timeline -timeline-max-iterations=3 < %s | FileCheck %s

# A packed multiply followed by two horizontal adds. Each vhaddps reads (as
# both sources) the register written by the instruction just before it, so the
# three instructions of one iteration execute back to back. No instruction
# reads a register written by a previous iteration.
vmulps %xmm0, %xmm1, %xmm2
vhaddps %xmm2, %xmm2, %xmm3
vhaddps %xmm3, %xmm3, %xmm4

# CHECK: Iterations: 300
# CHECK-NEXT: Instructions: 900
# CHECK-NEXT: Total Cycles: 1211
# CHECK-NEXT: Total uOps: 2100

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 1.73
# CHECK-NEXT: IPC: 0.74
# CHECK-NEXT: Block RThroughput: 4.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 5 1.00 vmulps %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 3 5 2.00 vhaddps %xmm2, %xmm2, %xmm3
# CHECK-NEXT: 3 5 2.00 vhaddps %xmm3, %xmm3, %xmm4

# CHECK: Resources:
# CHECK-NEXT: [0] - SBDivider
# CHECK-NEXT: [1] - SBFPDivider
# CHECK-NEXT: [2] - SBPort0
# CHECK-NEXT: [3] - SBPort1
# CHECK-NEXT: [4] - SBPort4
# CHECK-NEXT: [5] - SBPort5
# CHECK-NEXT: [6.0] - SBPort23
# CHECK-NEXT: [6.1] - SBPort23

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
# CHECK-NEXT: - - 1.00 2.00 - 4.00 - -

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
# CHECK-NEXT: - - 1.00 - - - - - vmulps %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 1.00 - 2.00 - - vhaddps %xmm2, %xmm2, %xmm3
# CHECK-NEXT: - - - 1.00 - 2.00 - - vhaddps %xmm3, %xmm3, %xmm4

# CHECK: Timeline view:
# CHECK-NEXT: 0123456789
# CHECK-NEXT: Index 0123456789 012

# CHECK: [0,0] DeeeeeER . . . . vmulps %xmm0, %xmm1, %xmm2
# CHECK-NEXT: [0,1] D=====eeeeeER . . . vhaddps %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [0,2] .D==========eeeeeER . . vhaddps %xmm3, %xmm3, %xmm4
# CHECK-NEXT: [1,0] .DeeeeeE----------R . . vmulps %xmm0, %xmm1, %xmm2
# CHECK-NEXT: [1,1] . D=====eeeeeE----R . . vhaddps %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [1,2] . D==========eeeeeER . vhaddps %xmm3, %xmm3, %xmm4
# CHECK-NEXT: [2,0] . DeeeeeE----------R . vmulps %xmm0, %xmm1, %xmm2
# CHECK-NEXT: [2,1] . D=====eeeeeE----R . vhaddps %xmm2, %xmm2, %xmm3
# CHECK-NEXT: [2,2] . D==========eeeeeER vhaddps %xmm3, %xmm3, %xmm4

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 3 1.0 1.0 6.7 vmulps %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1. 3 6.0 0.7 2.7 vhaddps %xmm2, %xmm2, %xmm3
# CHECK-NEXT: 2. 3 11.0 1.0 0.0 vhaddps %xmm3, %xmm3, %xmm4
44 changes: 44 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-1.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s

# vshufps writes %xmm1, which is the register source of the vhaddps that
# follows; the other vhaddps source is loaded from (%rdi).
# NOTE(review): judging by the test name, this presumably checks how the
# register operand of a load-folding instruction is scheduled - confirm.
vshufps $0, %xmm0, %xmm1, %xmm1
vhaddps (%rdi), %xmm1, %xmm2

# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 2
# CHECK-NEXT: Total Cycles: 15
# CHECK-NEXT: Total uOps: 5

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 0.33
# CHECK-NEXT: IPC: 0.13
# CHECK-NEXT: Block RThroughput: 3.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 1 1.00 vshufps $0, %xmm0, %xmm1, %xmm1
# CHECK-NEXT: 4 11 2.00 * vhaddps (%rdi), %xmm1, %xmm2

# CHECK: Timeline view:
# CHECK-NEXT: 01234
# CHECK-NEXT: Index 0123456789

# CHECK: [0,0] DeER . . . vshufps $0, %xmm0, %xmm1, %xmm1
# CHECK-NEXT: [0,1] .DeeeeeeeeeeeER vhaddps (%rdi), %xmm1, %xmm2

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vshufps $0, %xmm0, %xmm1, %xmm1
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 vhaddps (%rdi), %xmm1, %xmm2
44 changes: 44 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BdVer2/hadd-read-after-ld-2.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=1 -timeline -resource-pressure=false < %s | FileCheck %s

# vshufps writes %xmm1; the following vhaddps reads %ymm1 (whose low half is
# %xmm1) as its register source and loads its other source from (%rdi).
# NOTE(review): judging by the test name, this presumably checks how the
# register operand of a load-folding instruction is scheduled - confirm.
vshufps $0, %xmm0, %xmm1, %xmm1
vhaddps (%rdi), %ymm1, %ymm2

# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 2
# CHECK-NEXT: Total Cycles: 16
# CHECK-NEXT: Total uOps: 5

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 0.31
# CHECK-NEXT: IPC: 0.13
# CHECK-NEXT: Block RThroughput: 3.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 1 1.00 vshufps $0, %xmm0, %xmm1, %xmm1
# CHECK-NEXT: 4 12 2.00 * vhaddps (%rdi), %ymm1, %ymm2

# CHECK: Timeline view:
# CHECK-NEXT: 012345
# CHECK-NEXT: Index 0123456789

# CHECK: [0,0] DeER . . . vshufps $0, %xmm0, %xmm1, %xmm1
# CHECK-NEXT: [0,1] .DeeeeeeeeeeeeER vhaddps (%rdi), %ymm1, %ymm2

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vshufps $0, %xmm0, %xmm1, %xmm1
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 vhaddps (%rdi), %ymm1, %ymm2
36 changes: 36 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BdVer2/instruction-info-view.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false -instruction-info=true < %s | FileCheck %s --check-prefix=ENABLED
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false -instruction-info=false < %s | FileCheck %s -check-prefix=DISABLED
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false -instruction-info < %s | FileCheck %s -check-prefix=ENABLED
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -resource-pressure=false < %s | FileCheck %s -check-prefix=ENABLED

# Input shared by every llvm-mca invocation above; the invocations differ only
# in how -instruction-info is passed (=true, =false, bare flag, or omitted).
# The sequence is a packed multiply followed by two horizontal adds, each
# reading the register written by the instruction before it.
vmulps %xmm0, %xmm1, %xmm2
vhaddps %xmm2, %xmm2, %xmm3
vhaddps %xmm3, %xmm3, %xmm4

# DISABLED-NOT: Instruction Info:


# ENABLED: Iterations: 100
# ENABLED-NEXT: Instructions: 300
# ENABLED-NEXT: Total Cycles: 414
# ENABLED-NEXT: Total uOps: 700


# ENABLED: Dispatch Width: 4
# ENABLED-NEXT: uOps Per Cycle: 1.69
# ENABLED-NEXT: IPC: 0.72
# ENABLED-NEXT: Block RThroughput: 4.0

# ENABLED: Instruction Info:
# ENABLED-NEXT: [1]: #uOps
# ENABLED-NEXT: [2]: Latency
# ENABLED-NEXT: [3]: RThroughput
# ENABLED-NEXT: [4]: MayLoad
# ENABLED-NEXT: [5]: MayStore
# ENABLED-NEXT: [6]: HasSideEffects (U)

# ENABLED: [1] [2] [3] [4] [5] [6] Instructions:
# ENABLED-NEXT: 1 5 1.00 vmulps %xmm0, %xmm1, %xmm2
# ENABLED-NEXT: 3 5 2.00 vhaddps %xmm2, %xmm2, %xmm3
# ENABLED-NEXT: 3 5 2.00 vhaddps %xmm3, %xmm3, %xmm4
93 changes: 93 additions & 0 deletions llvm/test/tools/llvm-mca/X86/BdVer2/load-store-alias.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -iterations=100 -timeline -timeline-max-iterations=1 -noalias=false < %s | FileCheck %s

# Four load/store pairs that copy through %xmm0 from (%rsi) to (%rdi) at
# offsets 0, 16, 32 and 48; each store writes the value loaded just before it.
# The invocation above passes -noalias=false, and in the timeline recorded
# below every load begins executing only after the preceding store has
# executed.
vmovaps (%rsi), %xmm0
vmovaps %xmm0, (%rdi)
vmovaps 16(%rsi), %xmm0
vmovaps %xmm0, 16(%rdi)
vmovaps 32(%rsi), %xmm0
vmovaps %xmm0, 32(%rdi)
vmovaps 48(%rsi), %xmm0
vmovaps %xmm0, 48(%rdi)

# CHECK: Iterations: 100
# CHECK-NEXT: Instructions: 800
# CHECK-NEXT: Total Cycles: 2803
# CHECK-NEXT: Total uOps: 800

# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 0.29
# CHECK-NEXT: IPC: 0.29
# CHECK-NEXT: Block RThroughput: 4.0

# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)

# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 6 0.50 * vmovaps (%rsi), %xmm0
# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, (%rdi)
# CHECK-NEXT: 1 6 0.50 * vmovaps 16(%rsi), %xmm0
# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 16(%rdi)
# CHECK-NEXT: 1 6 0.50 * vmovaps 32(%rsi), %xmm0
# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: 1 6 0.50 * vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: 1 1 1.00 * vmovaps %xmm0, 48(%rdi)

# CHECK: Resources:
# CHECK-NEXT: [0] - SBDivider
# CHECK-NEXT: [1] - SBFPDivider
# CHECK-NEXT: [2] - SBPort0
# CHECK-NEXT: [3] - SBPort1
# CHECK-NEXT: [4] - SBPort4
# CHECK-NEXT: [5] - SBPort5
# CHECK-NEXT: [6.0] - SBPort23
# CHECK-NEXT: [6.1] - SBPort23

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1]
# CHECK-NEXT: - - - - 4.00 - - 8.00

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6.0] [6.1] Instructions:
# CHECK-NEXT: - - - - - - - 1.00 vmovaps (%rsi), %xmm0
# CHECK-NEXT: - - - - 1.00 - - 1.00 vmovaps %xmm0, (%rdi)
# CHECK-NEXT: - - - - - - - 1.00 vmovaps 16(%rsi), %xmm0
# CHECK-NEXT: - - - - 1.00 - - 1.00 vmovaps %xmm0, 16(%rdi)
# CHECK-NEXT: - - - - - - - 1.00 vmovaps 32(%rsi), %xmm0
# CHECK-NEXT: - - - - 1.00 - - 1.00 vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: - - - - - - - 1.00 vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: - - - - 1.00 - - 1.00 vmovaps %xmm0, 48(%rdi)

# CHECK: Timeline view:
# CHECK-NEXT: 0123456789 0
# CHECK-NEXT: Index 0123456789 0123456789

# CHECK: [0,0] DeeeeeeER . . . . . vmovaps (%rsi), %xmm0
# CHECK-NEXT: [0,1] D======eER. . . . . vmovaps %xmm0, (%rdi)
# CHECK-NEXT: [0,2] D=======eeeeeeER . . . vmovaps 16(%rsi), %xmm0
# CHECK-NEXT: [0,3] D=============eER . . . vmovaps %xmm0, 16(%rdi)
# CHECK-NEXT: [0,4] .D=============eeeeeeER . . vmovaps 32(%rsi), %xmm0
# CHECK-NEXT: [0,5] .D===================eER . . vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: [0,6] .D====================eeeeeeER. vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: [0,7] .D==========================eER vmovaps %xmm0, 48(%rdi)

# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage

# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 vmovaps (%rsi), %xmm0
# CHECK-NEXT: 1. 1 7.0 0.0 0.0 vmovaps %xmm0, (%rdi)
# CHECK-NEXT: 2. 1 8.0 0.0 0.0 vmovaps 16(%rsi), %xmm0
# CHECK-NEXT: 3. 1 14.0 0.0 0.0 vmovaps %xmm0, 16(%rdi)
# CHECK-NEXT: 4. 1 14.0 0.0 0.0 vmovaps 32(%rsi), %xmm0
# CHECK-NEXT: 5. 1 20.0 0.0 0.0 vmovaps %xmm0, 32(%rdi)
# CHECK-NEXT: 6. 1 21.0 0.0 0.0 vmovaps 48(%rsi), %xmm0
# CHECK-NEXT: 7. 1 27.0 0.0 0.0 vmovaps %xmm0, 48(%rdi)
Loading