[X86] Expose memory codegen in element insert load tests to improve accuracy of checks

Also replace X32 with X86 check prefixes for i686 tests (we tend to try to use X32 for gnux32 targets)
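(For reference: the X32 prefix is conventionally reserved for the x32 ILP32 ABI on x86-64 rather than for i686. A hypothetical RUN line for such a target — not part of this commit — would look like:

; RUN: llc < %s -mtriple=x86_64-linux-gnux32 | FileCheck %s --check-prefixes=CHECK,X32

so moving the i686 tests to an X86 prefix avoids the ambiguity.)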
RKSimon committed Aug 22, 2021
1 parent a1c892b commit 7b7ac4b
Showing 2 changed files with 49 additions and 57 deletions.
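The UTC_ARGS marker added to the NOTE line of both tests records extra flags for utils/update_llc_test_checks.py to reuse whenever the assertions are regenerated. A minimal sketch of the regeneration step, assuming an LLVM source tree with llc already built (the build path is illustrative):

# update_llc_test_checks.py re-reads UTC_ARGS from the NOTE line on later runs,
# so --no_x86_scrub_mem_shuffle only needs to be passed explicitly once.
llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
    --no_x86_scrub_mem_shuffle \
    llvm/test/CodeGen/X86/avx.ll llvm/test/CodeGen/X86/sse41.ll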
104 changes: 48 additions & 56 deletions llvm/test/CodeGen/X86/avx.ll
@@ -1,5 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X32
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X86
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X64

 define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
@@ -43,16 +43,15 @@ define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone

 define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
-; On X32, account for the argument's move to registers
-; X32-LABEL: insertps_from_vector_load:
-; X32:       ## %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X32-NEXT:    retl
+; X86-LABEL: insertps_from_vector_load:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -61,38 +60,34 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocap

 ;; Use a non-zero CountS for insertps
 define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
-; On X32, account for the argument's move to registers
-;; Try to match a bit more of the instr, since we need the load's offset.
-; X32-LABEL: insertps_from_vector_load_offset:
-; X32:       ## %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X32-NEXT:    retl
+; X86-LABEL: insertps_from_vector_load_offset:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vinsertps $32, 4(%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X64-NEXT:    vinsertps $32, 4(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
   ret <4 x float> %2
 }

 define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
-; On X32, account for the argument's move to registers
-;; Try to match a bit more of the instr, since we need the load's offset.
-; X32-LABEL: insertps_from_vector_load_offset_2:
-; X32:       ## %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    shll $4, %ecx
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
-; X32-NEXT:    retl
+; X86-LABEL: insertps_from_vector_load_offset_2:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    vinsertps $0, 12(%eax,%ecx), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset_2:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    shlq $4, %rsi
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; X64-NEXT:    vinsertps $0, 12(%rdi,%rsi), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
   %2 = load <4 x float>, <4 x float>* %1, align 16
@@ -101,17 +96,16 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
 }

 define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
-; On X32, account for the arguments' move to registers
-; X32-LABEL: insertps_from_broadcast_loadf32:
-; X32:       ## %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X32-NEXT:    retl
+; X86-LABEL: insertps_from_broadcast_loadf32:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_broadcast_loadf32:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT:    retq
   %1 = getelementptr inbounds float, float* %fb, i64 %index
   %2 = load float, float* %1, align 4
@@ -124,16 +118,15 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap
 }

 define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
-; On X32, account for the arguments' move to registers
-; X32-LABEL: insertps_from_broadcast_loadv4f32:
-; X32:       ## %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X32-NEXT:    retl
+; X86-LABEL: insertps_from_broadcast_loadv4f32:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_broadcast_loadv4f32:
 ; X64:       ## %bb.0:
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, <4 x float>* %b, align 4
   %2 = extractelement <4 x float> %1, i32 0
@@ -147,20 +140,19 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float

 ;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
 define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
-; On X32, account for the arguments' move to registers
-; X32-LABEL: insertps_from_broadcast_multiple_use:
-; X32:       ## %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
-; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X32-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
-; X32-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT:    retl
+; X86-LABEL: insertps_from_broadcast_multiple_use:
+; X86:       ## %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
+; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
+; X86-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X86-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; X86-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_broadcast_multiple_use:
 ; X64:       ## %bb.0:
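The accuracy gain is visible in the hunks above: the scrubbed pattern accepted any vinsertps with a memory source, so a wrong address computation would still pass. A sketch of the difference, using insertps_from_vector_load_offset as the example (the imagined regression is hypothetical):

; Scrubbed form: mem[0] matches a load from (%rdi), 4(%rdi), or anywhere else,
; so codegen that dropped the +4 offset would still pass:
;   vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; Exposed form: pins the immediate and the exact address, so the same bug fails:
;   vinsertps $32, 4(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]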
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/sse41.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
 ; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=SSE,X86-SSE
 ; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX1
 ; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX512,X86-AVX512
