diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index 2a7ed3a8b4e71..138d60b05ba9b 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X32-SSE2
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=X64,X64-SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -268,7 +268,7 @@ entry:
 }
 
 ; Test for bad extractions from a VBROADCAST_LOAD of the <2 x i16> non-uniform constant bitcast as <4 x i32>.
-define void @subextract_broadcast_load_constant(<2 x i16>* nocapture %0, i16* nocapture %1, i16* nocapture %2) {
+define void @subextract_broadcast_load_constant(<2 x i16>* nocapture %0, i16* nocapture %1, i16* nocapture %2) nounwind {
 ; X32-SSE2-LABEL: subextract_broadcast_load_constant:
 ; X32-SSE2:       # %bb.0:
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -301,7 +301,7 @@ define void @subextract_broadcast_load_constant(<2 x i16>* nocapture %0, i16* no
   ret void
 }
 
-define i32 @multi_use_load_scalarization(<4 x i32>* %p) {
+define i32 @multi_use_load_scalarization(<4 x i32>* %p) nounwind {
 ; X32-SSE2-LABEL: multi_use_load_scalarization:
 ; X32-SSE2:       # %bb.0:
 ; X32-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -335,3 +335,146 @@ define i32 @multi_use_load_scalarization(<4 x i32>* %p) {
   %r = extractelement <4 x i32> %v, i64 0
   ret i32 %r
 }
+
+@n1 = local_unnamed_addr global <8 x i32> , align 32
+@zero = internal unnamed_addr global <8 x i32> zeroinitializer, align 32
+
+define i32 @main() nounwind {
+; X32-SSE2-LABEL: main:
+; X32-SSE2:       # %bb.0:
+; X32-SSE2-NEXT:    pushl %ebp
+; X32-SSE2-NEXT:    movl %esp, %ebp
+; X32-SSE2-NEXT:    pushl %esi
+; X32-SSE2-NEXT:    andl $-32, %esp
+; X32-SSE2-NEXT:    subl $64, %esp
+; X32-SSE2-NEXT:    movdqa zero, %xmm0
+; X32-SSE2-NEXT:    movaps n1+16, %xmm1
+; X32-SSE2-NEXT:    movaps n1, %xmm2
+; X32-SSE2-NEXT:    movaps %xmm2, zero
+; X32-SSE2-NEXT:    movaps %xmm1, zero+16
+; X32-SSE2-NEXT:    movaps {{.*#+}} xmm1 = [2,2,2,2]
+; X32-SSE2-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE2-NEXT:    movaps %xmm1, (%esp)
+; X32-SSE2-NEXT:    movdqa (%esp), %xmm1
+; X32-SSE2-NEXT:    movaps {{[0-9]+}}(%esp), %xmm2
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X32-SSE2-NEXT:    movd %xmm2, %eax
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X32-SSE2-NEXT:    movd %xmm2, %ecx
+; X32-SSE2-NEXT:    xorl %edx, %edx
+; X32-SSE2-NEXT:    divl %ecx
+; X32-SSE2-NEXT:    movl %eax, %ecx
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X32-SSE2-NEXT:    movd %xmm0, %eax
+; X32-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X32-SSE2-NEXT:    movd %xmm0, %esi
+; X32-SSE2-NEXT:    xorl %edx, %edx
+; X32-SSE2-NEXT:    divl %esi
+; X32-SSE2-NEXT:    addl %ecx, %eax
+; X32-SSE2-NEXT:    leal -4(%ebp), %esp
+; X32-SSE2-NEXT:    popl %esi
+; X32-SSE2-NEXT:    popl %ebp
+; X32-SSE2-NEXT:    retl
+;
+; X64-SSSE3-LABEL: main:
+; X64-SSSE3:       # %bb.0:
+; X64-SSSE3-NEXT:    pushq %rbp
+; X64-SSSE3-NEXT:    movq %rsp, %rbp
+; X64-SSSE3-NEXT:    andq $-32, %rsp
+; X64-SSSE3-NEXT:    subq $64, %rsp
+; X64-SSSE3-NEXT:    movdqa zero(%rip), %xmm0
+; X64-SSSE3-NEXT:    movq n1@GOTPCREL(%rip), %rax
+; X64-SSSE3-NEXT:    movaps (%rax), %xmm1
+; X64-SSSE3-NEXT:    movaps 16(%rax), %xmm2
+; X64-SSSE3-NEXT:    movaps %xmm1, zero(%rip)
+; X64-SSSE3-NEXT:    movaps %xmm2, zero+16(%rip)
+; X64-SSSE3-NEXT:    movaps {{.*#+}} xmm1 = [2,2,2,2]
+; X64-SSSE3-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; X64-SSSE3-NEXT:    movaps %xmm1, (%rsp)
+; X64-SSSE3-NEXT:    movdqa (%rsp), %xmm1
+; X64-SSSE3-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm2
+; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; X64-SSSE3-NEXT:    movd %xmm2, %eax
+; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; X64-SSSE3-NEXT:    movd %xmm2, %ecx
+; X64-SSSE3-NEXT:    xorl %edx, %edx
+; X64-SSSE3-NEXT:    divl %ecx
+; X64-SSSE3-NEXT:    movl %eax, %ecx
+; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X64-SSSE3-NEXT:    movd %xmm0, %eax
+; X64-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
+; X64-SSSE3-NEXT:    movd %xmm0, %esi
+; X64-SSSE3-NEXT:    xorl %edx, %edx
+; X64-SSSE3-NEXT:    divl %esi
+; X64-SSSE3-NEXT:    addl %ecx, %eax
+; X64-SSSE3-NEXT:    movq %rbp, %rsp
+; X64-SSSE3-NEXT:    popq %rbp
+; X64-SSSE3-NEXT:    retq
+;
+; X64-AVX1-LABEL: main:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    pushq %rbp
+; X64-AVX1-NEXT:    movq %rsp, %rbp
+; X64-AVX1-NEXT:    andq $-32, %rsp
+; X64-AVX1-NEXT:    subq $64, %rsp
+; X64-AVX1-NEXT:    movq n1@GOTPCREL(%rip), %rax
+; X64-AVX1-NEXT:    vmovaps (%rax), %ymm0
+; X64-AVX1-NEXT:    vmovaps zero(%rip), %xmm1
+; X64-AVX1-NEXT:    vmovaps %ymm0, zero(%rip)
+; X64-AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
+; X64-AVX1-NEXT:    vmovaps %ymm0, (%rsp)
+; X64-AVX1-NEXT:    vmovaps (%rsp), %ymm0
+; X64-AVX1-NEXT:    vextractps $2, %xmm1, %eax
+; X64-AVX1-NEXT:    vextractps $2, %xmm0, %ecx
+; X64-AVX1-NEXT:    xorl %edx, %edx
+; X64-AVX1-NEXT:    divl %ecx
+; X64-AVX1-NEXT:    movl %eax, %ecx
+; X64-AVX1-NEXT:    vextractps $1, %xmm1, %eax
+; X64-AVX1-NEXT:    vextractps $1, %xmm0, %esi
+; X64-AVX1-NEXT:    xorl %edx, %edx
+; X64-AVX1-NEXT:    divl %esi
+; X64-AVX1-NEXT:    addl %ecx, %eax
+; X64-AVX1-NEXT:    movq %rbp, %rsp
+; X64-AVX1-NEXT:    popq %rbp
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-AVX2-LABEL: main:
+; X64-AVX2:       # %bb.0:
+; X64-AVX2-NEXT:    pushq %rbp
+; X64-AVX2-NEXT:    movq %rsp, %rbp
+; X64-AVX2-NEXT:    andq $-32, %rsp
+; X64-AVX2-NEXT:    subq $64, %rsp
+; X64-AVX2-NEXT:    movq n1@GOTPCREL(%rip), %rax
+; X64-AVX2-NEXT:    vmovaps (%rax), %ymm0
+; X64-AVX2-NEXT:    vmovaps zero(%rip), %xmm1
+; X64-AVX2-NEXT:    vmovaps %ymm0, zero(%rip)
+; X64-AVX2-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2]
+; X64-AVX2-NEXT:    vmovaps %ymm0, (%rsp)
+; X64-AVX2-NEXT:    vmovaps (%rsp), %ymm0
+; X64-AVX2-NEXT:    vextractps $2, %xmm1, %eax
+; X64-AVX2-NEXT:    vextractps $2, %xmm0, %ecx
+; X64-AVX2-NEXT:    xorl %edx, %edx
+; X64-AVX2-NEXT:    divl %ecx
+; X64-AVX2-NEXT:    movl %eax, %ecx
+; X64-AVX2-NEXT:    vextractps $1, %xmm1, %eax
+; X64-AVX2-NEXT:    vextractps $1, %xmm0, %esi
+; X64-AVX2-NEXT:    xorl %edx, %edx
+; X64-AVX2-NEXT:    divl %esi
+; X64-AVX2-NEXT:    addl %ecx, %eax
+; X64-AVX2-NEXT:    movq %rbp, %rsp
+; X64-AVX2-NEXT:    popq %rbp
+; X64-AVX2-NEXT:    vzeroupper
+; X64-AVX2-NEXT:    retq
+  %stackptr = alloca <8 x i32>, align 32
+  %z = load <8 x i32>, <8 x i32>* @zero, align 32
+  %t1 = load <8 x i32>, <8 x i32>* @n1, align 32
+  store <8 x i32> %t1, <8 x i32>* @zero, align 32
+  store volatile <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32>* %stackptr, align 32
+  %stackload = load volatile <8 x i32>, <8 x i32>* %stackptr, align 32
+  %div = udiv <8 x i32> %z, %stackload
+  %e1 = extractelement <8 x i32> %div, i64 1
+  %e2 = extractelement <8 x i32> %div, i64 2
+  %r = add i32 %e1, %e2
+  ret i32 %r
+}
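
Note: the CHECK lines in this test are autogenerated, as stated in the file's NOTE header. A minimal sketch of how they would typically be regenerated after editing the IR, assuming an LLVM source checkout and a freshly built llc on PATH:

  llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/X86/extractelement-load.ll

The script reruns each RUN line and rewrites the per-prefix assertions, which is why splitting the AVX RUN lines into X64-AVX1/X64-AVX2 prefixes produces the two separate check blocks above.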