diff --git a/llvm/test/CodeGen/X86/fast-isel-store.ll b/llvm/test/CodeGen/X86/fast-isel-store.ll index 8472498ba41f9..eba538d213392 100644 --- a/llvm/test/CodeGen/X86/fast-isel-store.ll +++ b/llvm/test/CodeGen/X86/fast-isel-store.ll @@ -1,771 +1,747 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=ALL32,SSE32 -; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=ALL64,SSE64 -; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefixes=ALL32,AVX32,AVXONLY32 -; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefixes=ALL64,AVX64,AVXONLY64 -; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=ALL32,AVX32,AVX51232 -; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=ALL64,AVX64,AVX51264 -; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=ALL32,AVX32,AVX51232 -; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=ALL64,AVX64,AVX51264 +; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X86,X86-SSE +; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=X64,X64-SSE +; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX1 +; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1 +; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX512 +; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512 +; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=X86,X86-AVX,X86-AVX512 +; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512 -define i32 @test_store_32(ptr nocapture %addr, i32 %value) { -; ALL32-LABEL: test_store_32: -; ALL32: # %bb.0: # %entry -; ALL32-NEXT: movl %esi, %eax -; ALL32-NEXT: movl %esi, (%rdi) -; ALL32-NEXT: retq -; -; ALL64-LABEL: test_store_32: -; ALL64: # %bb.0: # %entry -; ALL64-NEXT: movl {{[0-9]+}}(%esp), %eax -; ALL64-NEXT: movl {{[0-9]+}}(%esp), %ecx -; ALL64-NEXT: movl %eax, (%ecx) -; ALL64-NEXT: retl +define i32 @test_store_32(ptr nocapture %addr, i32 %value) nounwind { +; X86-LABEL: test_store_32: +; X86: # %bb.0: # %entry +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %eax, (%ecx) +; X86-NEXT: retl +; +; X64-LABEL: test_store_32: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movl %esi, (%rdi) +; X64-NEXT: retq entry: store i32 %value, ptr %addr, align 1 ret i32 %value } -define i16 @test_store_16(ptr nocapture %addr, i16 %value) { -; ALL32-LABEL: test_store_16: -; ALL32: # %bb.0: # %entry -; ALL32-NEXT: movl %esi, %eax -; ALL32-NEXT: movw %ax, (%rdi) -; ALL32-NEXT: # kill: def $ax killed $ax killed $eax -; ALL32-NEXT: retq -; -; ALL64-LABEL: test_store_16: -; ALL64: # %bb.0: # %entry -; ALL64-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; ALL64-NEXT: movl {{[0-9]+}}(%esp), %ecx -; ALL64-NEXT: movw %ax, (%ecx) -; ALL64-NEXT: retl +define i16 @test_store_16(ptr nocapture %addr, i16 %value) nounwind { +; X86-LABEL: test_store_16: +; X86: # %bb.0: # %entry +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movw %ax, (%ecx) +; X86-NEXT: retl +; +; X64-LABEL: test_store_16: +; X64: # %bb.0: # %entry +; X64-NEXT: movl %esi, %eax +; X64-NEXT: movw %ax, (%rdi) +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq entry: store i16 %value, ptr %addr, align 1 ret i16 %value } -define <4 x i32> @test_store_4xi32(ptr nocapture %addr, <4 x i32> %value, <4 x i32> %value2) { -; SSE32-LABEL: test_store_4xi32: -; SSE32: # %bb.0: -; SSE32-NEXT: paddd %xmm1, %xmm0 -; SSE32-NEXT: movdqu %xmm0, (%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_4xi32: -; SSE64: # %bb.0: -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: paddd %xmm1, %xmm0 -; SSE64-NEXT: movdqu %xmm0, (%eax) -; SSE64-NEXT: retl -; -; AVX32-LABEL: test_store_4xi32: -; AVX32: # %bb.0: -; AVX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX32-NEXT: vmovdqu %xmm0, (%rdi) -; AVX32-NEXT: retq -; -; AVX64-LABEL: test_store_4xi32: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX64-NEXT: vmovdqu %xmm0, (%eax) -; AVX64-NEXT: retl +define <4 x i32> @test_store_4xi32(ptr nocapture %addr, <4 x i32> %value, <4 x i32> %value2) nounwind { +; X86-SSE-LABEL: test_store_4xi32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: paddd %xmm1, %xmm0 +; X86-SSE-NEXT: movdqu %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_4xi32: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: paddd %xmm1, %xmm0 +; X64-SSE-NEXT: movdqu %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX-LABEL: test_store_4xi32: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovdqu %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: test_store_4xi32: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqu %xmm0, (%rdi) +; X64-AVX-NEXT: retq %foo = add <4 x i32> %value, %value2 ; to force integer type on store store <4 x i32> %foo, ptr %addr, align 1 ret <4 x i32> %foo } -define <4 x i32> @test_store_4xi32_aligned(ptr nocapture %addr, <4 x i32> %value, <4 x i32> %value2) { -; SSE32-LABEL: test_store_4xi32_aligned: -; SSE32: # %bb.0: -; SSE32-NEXT: paddd %xmm1, %xmm0 -; SSE32-NEXT: movdqa %xmm0, (%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_4xi32_aligned: -; SSE64: # %bb.0: -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: paddd %xmm1, %xmm0 -; SSE64-NEXT: movdqa %xmm0, (%eax) -; SSE64-NEXT: retl -; -; AVX32-LABEL: test_store_4xi32_aligned: -; AVX32: # %bb.0: -; AVX32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX32-NEXT: vmovdqa %xmm0, (%rdi) -; AVX32-NEXT: retq -; -; AVX64-LABEL: test_store_4xi32_aligned: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX64-NEXT: vmovdqa %xmm0, (%eax) -; AVX64-NEXT: retl +define <4 x i32> @test_store_4xi32_aligned(ptr nocapture %addr, <4 x i32> %value, <4 x i32> %value2) nounwind { +; X86-SSE-LABEL: test_store_4xi32_aligned: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: paddd %xmm1, %xmm0 +; X86-SSE-NEXT: movdqa %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_4xi32_aligned: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: paddd %xmm1, %xmm0 +; X64-SSE-NEXT: movdqa %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX-LABEL: test_store_4xi32_aligned: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovdqa %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: test_store_4xi32_aligned: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovdqa %xmm0, (%rdi) +; X64-AVX-NEXT: retq %foo = add <4 x i32> %value, %value2 ; to force integer type on store store <4 x i32> %foo, ptr %addr, align 16 ret <4 x i32> %foo } -define <4 x float> @test_store_4xf32(ptr nocapture %addr, <4 x float> %value) { -; SSE32-LABEL: test_store_4xf32: -; SSE32: # %bb.0: -; SSE32-NEXT: movups %xmm0, (%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_4xf32: -; SSE64: # %bb.0: -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: movups %xmm0, (%eax) -; SSE64-NEXT: retl -; -; AVX32-LABEL: test_store_4xf32: -; AVX32: # %bb.0: -; AVX32-NEXT: vmovups %xmm0, (%rdi) -; AVX32-NEXT: retq -; -; AVX64-LABEL: test_store_4xf32: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovups %xmm0, (%eax) -; AVX64-NEXT: retl +define <4 x float> @test_store_4xf32(ptr nocapture %addr, <4 x float> %value) nounwind { +; X86-SSE-LABEL: test_store_4xf32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movups %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_4xf32: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movups %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX-LABEL: test_store_4xf32: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovups %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: test_store_4xf32: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovups %xmm0, (%rdi) +; X64-AVX-NEXT: retq store <4 x float> %value, ptr %addr, align 1 ret <4 x float> %value } -define <4 x float> @test_store_4xf32_aligned(ptr nocapture %addr, <4 x float> %value) { -; SSE32-LABEL: test_store_4xf32_aligned: -; SSE32: # %bb.0: -; SSE32-NEXT: movaps %xmm0, (%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_4xf32_aligned: -; SSE64: # %bb.0: -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: movaps %xmm0, (%eax) -; SSE64-NEXT: retl -; -; AVX32-LABEL: test_store_4xf32_aligned: -; AVX32: # %bb.0: -; AVX32-NEXT: vmovaps %xmm0, (%rdi) -; AVX32-NEXT: retq -; -; AVX64-LABEL: test_store_4xf32_aligned: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovaps %xmm0, (%eax) -; AVX64-NEXT: retl +define <4 x float> @test_store_4xf32_aligned(ptr nocapture %addr, <4 x float> %value) nounwind { +; X86-SSE-LABEL: test_store_4xf32_aligned: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_4xf32_aligned: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX-LABEL: test_store_4xf32_aligned: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: test_store_4xf32_aligned: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) +; X64-AVX-NEXT: retq store <4 x float> %value, ptr %addr, align 16 ret <4 x float> %value } -define <2 x double> @test_store_2xf64(ptr nocapture %addr, <2 x double> %value, <2 x double> %value2) { -; SSE32-LABEL: test_store_2xf64: -; SSE32: # %bb.0: -; SSE32-NEXT: addpd %xmm1, %xmm0 -; SSE32-NEXT: movupd %xmm0, (%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_2xf64: -; SSE64: # %bb.0: -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: addpd %xmm1, %xmm0 -; SSE64-NEXT: movupd %xmm0, (%eax) -; SSE64-NEXT: retl -; -; AVX32-LABEL: test_store_2xf64: -; AVX32: # %bb.0: -; AVX32-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX32-NEXT: vmovupd %xmm0, (%rdi) -; AVX32-NEXT: retq -; -; AVX64-LABEL: test_store_2xf64: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX64-NEXT: vmovupd %xmm0, (%eax) -; AVX64-NEXT: retl +define <2 x double> @test_store_2xf64(ptr nocapture %addr, <2 x double> %value, <2 x double> %value2) nounwind { +; X86-SSE-LABEL: test_store_2xf64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: addpd %xmm1, %xmm0 +; X86-SSE-NEXT: movupd %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_2xf64: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: addpd %xmm1, %xmm0 +; X64-SSE-NEXT: movupd %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX-LABEL: test_store_2xf64: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovupd %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: test_store_2xf64: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovupd %xmm0, (%rdi) +; X64-AVX-NEXT: retq %foo = fadd <2 x double> %value, %value2 ; to force dobule type on store store <2 x double> %foo, ptr %addr, align 1 ret <2 x double> %foo } -define <2 x double> @test_store_2xf64_aligned(ptr nocapture %addr, <2 x double> %value, <2 x double> %value2) { -; SSE32-LABEL: test_store_2xf64_aligned: -; SSE32: # %bb.0: -; SSE32-NEXT: addpd %xmm1, %xmm0 -; SSE32-NEXT: movapd %xmm0, (%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_2xf64_aligned: -; SSE64: # %bb.0: -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: addpd %xmm1, %xmm0 -; SSE64-NEXT: movapd %xmm0, (%eax) -; SSE64-NEXT: retl -; -; AVX32-LABEL: test_store_2xf64_aligned: -; AVX32: # %bb.0: -; AVX32-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX32-NEXT: vmovapd %xmm0, (%rdi) -; AVX32-NEXT: retq -; -; AVX64-LABEL: test_store_2xf64_aligned: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; AVX64-NEXT: vmovapd %xmm0, (%eax) -; AVX64-NEXT: retl +define <2 x double> @test_store_2xf64_aligned(ptr nocapture %addr, <2 x double> %value, <2 x double> %value2) nounwind { +; X86-SSE-LABEL: test_store_2xf64_aligned: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: addpd %xmm1, %xmm0 +; X86-SSE-NEXT: movapd %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_2xf64_aligned: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: addpd %xmm1, %xmm0 +; X64-SSE-NEXT: movapd %xmm0, (%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX-LABEL: test_store_2xf64_aligned: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovapd %xmm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: test_store_2xf64_aligned: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; X64-AVX-NEXT: vmovapd %xmm0, (%rdi) +; X64-AVX-NEXT: retq %foo = fadd <2 x double> %value, %value2 ; to force dobule type on store store <2 x double> %foo, ptr %addr, align 16 ret <2 x double> %foo } -define <8 x i32> @test_store_8xi32(ptr nocapture %addr, <8 x i32> %value) { -; SSE32-LABEL: test_store_8xi32: -; SSE32: # %bb.0: -; SSE32-NEXT: movups %xmm0, (%rdi) -; SSE32-NEXT: movups %xmm1, 16(%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_8xi32: -; SSE64: # %bb.0: -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: movups %xmm0, (%eax) -; SSE64-NEXT: movups %xmm1, 16(%eax) -; SSE64-NEXT: retl -; -; AVX32-LABEL: test_store_8xi32: -; AVX32: # %bb.0: -; AVX32-NEXT: vmovups %ymm0, (%rdi) -; AVX32-NEXT: retq -; -; AVX64-LABEL: test_store_8xi32: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovups %ymm0, (%eax) -; AVX64-NEXT: retl +define <8 x i32> @test_store_8xi32(ptr nocapture %addr, <8 x i32> %value) nounwind { +; X86-SSE-LABEL: test_store_8xi32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movups %xmm0, (%eax) +; X86-SSE-NEXT: movups %xmm1, 16(%eax) +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_8xi32: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movups %xmm0, (%rdi) +; X64-SSE-NEXT: movups %xmm1, 16(%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX-LABEL: test_store_8xi32: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovups %ymm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: test_store_8xi32: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovups %ymm0, (%rdi) +; X64-AVX-NEXT: retq store <8 x i32> %value, ptr %addr, align 1 ret <8 x i32> %value } -define <8 x i32> @test_store_8xi32_aligned(ptr nocapture %addr, <8 x i32> %value) { -; SSE32-LABEL: test_store_8xi32_aligned: -; SSE32: # %bb.0: -; SSE32-NEXT: movaps %xmm0, (%rdi) -; SSE32-NEXT: movaps %xmm1, 16(%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_8xi32_aligned: -; SSE64: # %bb.0: -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: movaps %xmm0, (%eax) -; SSE64-NEXT: movaps %xmm1, 16(%eax) -; SSE64-NEXT: retl -; -; AVX32-LABEL: test_store_8xi32_aligned: -; AVX32: # %bb.0: -; AVX32-NEXT: vmovaps %ymm0, (%rdi) -; AVX32-NEXT: retq -; -; AVX64-LABEL: test_store_8xi32_aligned: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovaps %ymm0, (%eax) -; AVX64-NEXT: retl +define <8 x i32> @test_store_8xi32_aligned(ptr nocapture %addr, <8 x i32> %value) nounwind { +; X86-SSE-LABEL: test_store_8xi32_aligned: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: movaps %xmm1, 16(%eax) +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_8xi32_aligned: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps %xmm0, (%rdi) +; X64-SSE-NEXT: movaps %xmm1, 16(%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX-LABEL: test_store_8xi32_aligned: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: test_store_8xi32_aligned: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps %ymm0, (%rdi) +; X64-AVX-NEXT: retq store <8 x i32> %value, ptr %addr, align 32 ret <8 x i32> %value } -define <8 x float> @test_store_8xf32(ptr nocapture %addr, <8 x float> %value) { -; SSE32-LABEL: test_store_8xf32: -; SSE32: # %bb.0: -; SSE32-NEXT: movups %xmm0, (%rdi) -; SSE32-NEXT: movups %xmm1, 16(%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_8xf32: -; SSE64: # %bb.0: -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: movups %xmm0, (%eax) -; SSE64-NEXT: movups %xmm1, 16(%eax) -; SSE64-NEXT: retl -; -; AVX32-LABEL: test_store_8xf32: -; AVX32: # %bb.0: -; AVX32-NEXT: vmovups %ymm0, (%rdi) -; AVX32-NEXT: retq -; -; AVX64-LABEL: test_store_8xf32: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovups %ymm0, (%eax) -; AVX64-NEXT: retl +define <8 x float> @test_store_8xf32(ptr nocapture %addr, <8 x float> %value) nounwind { +; X86-SSE-LABEL: test_store_8xf32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movups %xmm0, (%eax) +; X86-SSE-NEXT: movups %xmm1, 16(%eax) +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_8xf32: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movups %xmm0, (%rdi) +; X64-SSE-NEXT: movups %xmm1, 16(%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX-LABEL: test_store_8xf32: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovups %ymm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: test_store_8xf32: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovups %ymm0, (%rdi) +; X64-AVX-NEXT: retq store <8 x float> %value, ptr %addr, align 1 ret <8 x float> %value } -define <8 x float> @test_store_8xf32_aligned(ptr nocapture %addr, <8 x float> %value) { -; SSE32-LABEL: test_store_8xf32_aligned: -; SSE32: # %bb.0: -; SSE32-NEXT: movaps %xmm0, (%rdi) -; SSE32-NEXT: movaps %xmm1, 16(%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_8xf32_aligned: -; SSE64: # %bb.0: -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: movaps %xmm0, (%eax) -; SSE64-NEXT: movaps %xmm1, 16(%eax) -; SSE64-NEXT: retl -; -; AVX32-LABEL: test_store_8xf32_aligned: -; AVX32: # %bb.0: -; AVX32-NEXT: vmovaps %ymm0, (%rdi) -; AVX32-NEXT: retq -; -; AVX64-LABEL: test_store_8xf32_aligned: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vmovaps %ymm0, (%eax) -; AVX64-NEXT: retl +define <8 x float> @test_store_8xf32_aligned(ptr nocapture %addr, <8 x float> %value) nounwind { +; X86-SSE-LABEL: test_store_8xf32_aligned: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: movaps %xmm1, 16(%eax) +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_8xf32_aligned: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps %xmm0, (%rdi) +; X64-SSE-NEXT: movaps %xmm1, 16(%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX-LABEL: test_store_8xf32_aligned: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: test_store_8xf32_aligned: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovaps %ymm0, (%rdi) +; X64-AVX-NEXT: retq store <8 x float> %value, ptr %addr, align 32 ret <8 x float> %value } -define <4 x double> @test_store_4xf64(ptr nocapture %addr, <4 x double> %value, <4 x double> %value2) { -; SSE32-LABEL: test_store_4xf64: -; SSE32: # %bb.0: -; SSE32-NEXT: addpd %xmm2, %xmm0 -; SSE32-NEXT: movupd %xmm0, (%rdi) -; SSE32-NEXT: addpd %xmm3, %xmm1 -; SSE32-NEXT: movupd %xmm1, 16(%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_4xf64: -; SSE64: # %bb.0: -; SSE64-NEXT: subl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 16 -; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3 -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: addpd %xmm2, %xmm0 -; SSE64-NEXT: movupd %xmm0, (%eax) -; SSE64-NEXT: addpd %xmm3, %xmm1 -; SSE64-NEXT: movupd %xmm1, 16(%eax) -; SSE64-NEXT: addl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 4 -; SSE64-NEXT: retl -; -; AVX32-LABEL: test_store_4xf64: -; AVX32: # %bb.0: -; AVX32-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX32-NEXT: vmovupd %ymm0, (%rdi) -; AVX32-NEXT: retq -; -; AVX64-LABEL: test_store_4xf64: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX64-NEXT: vmovupd %ymm0, (%eax) -; AVX64-NEXT: retl +define <4 x double> @test_store_4xf64(ptr nocapture %addr, <4 x double> %value, <4 x double> %value2) nounwind { +; X86-SSE-LABEL: test_store_4xf64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: movapd {{[0-9]+}}(%esp), %xmm3 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: addpd %xmm2, %xmm0 +; X86-SSE-NEXT: movupd %xmm0, (%eax) +; X86-SSE-NEXT: addpd %xmm3, %xmm1 +; X86-SSE-NEXT: movupd %xmm1, 16(%eax) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_4xf64: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: addpd %xmm2, %xmm0 +; X64-SSE-NEXT: movupd %xmm0, (%rdi) +; X64-SSE-NEXT: addpd %xmm3, %xmm1 +; X64-SSE-NEXT: movupd %xmm1, 16(%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX-LABEL: test_store_4xf64: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; X86-AVX-NEXT: vmovupd %ymm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: test_store_4xf64: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; X64-AVX-NEXT: vmovupd %ymm0, (%rdi) +; X64-AVX-NEXT: retq %foo = fadd <4 x double> %value, %value2 ; to force dobule type on store store <4 x double> %foo, ptr %addr, align 1 ret <4 x double> %foo } -define <4 x double> @test_store_4xf64_aligned(ptr nocapture %addr, <4 x double> %value, <4 x double> %value2) { -; SSE32-LABEL: test_store_4xf64_aligned: -; SSE32: # %bb.0: -; SSE32-NEXT: addpd %xmm2, %xmm0 -; SSE32-NEXT: movapd %xmm0, (%rdi) -; SSE32-NEXT: addpd %xmm3, %xmm1 -; SSE32-NEXT: movapd %xmm1, 16(%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_4xf64_aligned: -; SSE64: # %bb.0: -; SSE64-NEXT: subl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 16 -; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3 -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: addpd %xmm2, %xmm0 -; SSE64-NEXT: movapd %xmm0, (%eax) -; SSE64-NEXT: addpd %xmm3, %xmm1 -; SSE64-NEXT: movapd %xmm1, 16(%eax) -; SSE64-NEXT: addl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 4 -; SSE64-NEXT: retl -; -; AVX32-LABEL: test_store_4xf64_aligned: -; AVX32: # %bb.0: -; AVX32-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX32-NEXT: vmovapd %ymm0, (%rdi) -; AVX32-NEXT: retq -; -; AVX64-LABEL: test_store_4xf64_aligned: -; AVX64: # %bb.0: -; AVX64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX64-NEXT: vmovapd %ymm0, (%eax) -; AVX64-NEXT: retl +define <4 x double> @test_store_4xf64_aligned(ptr nocapture %addr, <4 x double> %value, <4 x double> %value2) nounwind { +; X86-SSE-LABEL: test_store_4xf64_aligned: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: movapd {{[0-9]+}}(%esp), %xmm3 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: addpd %xmm2, %xmm0 +; X86-SSE-NEXT: movapd %xmm0, (%eax) +; X86-SSE-NEXT: addpd %xmm3, %xmm1 +; X86-SSE-NEXT: movapd %xmm1, 16(%eax) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_4xf64_aligned: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: addpd %xmm2, %xmm0 +; X64-SSE-NEXT: movapd %xmm0, (%rdi) +; X64-SSE-NEXT: addpd %xmm3, %xmm1 +; X64-SSE-NEXT: movapd %xmm1, 16(%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX-LABEL: test_store_4xf64_aligned: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; X86-AVX-NEXT: vmovapd %ymm0, (%eax) +; X86-AVX-NEXT: retl +; +; X64-AVX-LABEL: test_store_4xf64_aligned: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; X64-AVX-NEXT: vmovapd %ymm0, (%rdi) +; X64-AVX-NEXT: retq %foo = fadd <4 x double> %value, %value2 ; to force dobule type on store store <4 x double> %foo, ptr %addr, align 32 ret <4 x double> %foo } -define <16 x i32> @test_store_16xi32(ptr nocapture %addr, <16 x i32> %value) { -; SSE32-LABEL: test_store_16xi32: -; SSE32: # %bb.0: -; SSE32-NEXT: movups %xmm0, (%rdi) -; SSE32-NEXT: movups %xmm1, 16(%rdi) -; SSE32-NEXT: movups %xmm2, 32(%rdi) -; SSE32-NEXT: movups %xmm3, 48(%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_16xi32: -; SSE64: # %bb.0: -; SSE64-NEXT: subl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 16 -; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: movups %xmm0, (%eax) -; SSE64-NEXT: movups %xmm1, 16(%eax) -; SSE64-NEXT: movups %xmm2, 32(%eax) -; SSE64-NEXT: movups %xmm3, 48(%eax) -; SSE64-NEXT: addl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 4 -; SSE64-NEXT: retl -; -; AVXONLY32-LABEL: test_store_16xi32: -; AVXONLY32: # %bb.0: -; AVXONLY32-NEXT: vmovups %ymm0, (%rdi) -; AVXONLY32-NEXT: vmovups %ymm1, 32(%rdi) -; AVXONLY32-NEXT: retq -; -; AVXONLY64-LABEL: test_store_16xi32: -; AVXONLY64: # %bb.0: -; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVXONLY64-NEXT: vmovups %ymm0, (%eax) -; AVXONLY64-NEXT: vmovups %ymm1, 32(%eax) -; AVXONLY64-NEXT: retl -; -; AVX51232-LABEL: test_store_16xi32: -; AVX51232: # %bb.0: -; AVX51232-NEXT: vmovups %zmm0, (%rdi) -; AVX51232-NEXT: retq -; -; AVX51264-LABEL: test_store_16xi32: -; AVX51264: # %bb.0: -; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX51264-NEXT: vmovups %zmm0, (%eax) -; AVX51264-NEXT: retl +define <16 x i32> @test_store_16xi32(ptr nocapture %addr, <16 x i32> %value) nounwind { +; X86-SSE-LABEL: test_store_16xi32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movups %xmm0, (%eax) +; X86-SSE-NEXT: movups %xmm1, 16(%eax) +; X86-SSE-NEXT: movups %xmm2, 32(%eax) +; X86-SSE-NEXT: movups %xmm3, 48(%eax) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_16xi32: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movups %xmm0, (%rdi) +; X64-SSE-NEXT: movups %xmm1, 16(%rdi) +; X64-SSE-NEXT: movups %xmm2, 32(%rdi) +; X64-SSE-NEXT: movups %xmm3, 48(%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX1-LABEL: test_store_16xi32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovups %ymm0, (%eax) +; X86-AVX1-NEXT: vmovups %ymm1, 32(%eax) +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_store_16xi32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups %ymm0, (%rdi) +; X64-AVX1-NEXT: vmovups %ymm1, 32(%rdi) +; X64-AVX1-NEXT: retq +; +; X86-AVX512-LABEL: test_store_16xi32: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vmovups %zmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-AVX512-LABEL: test_store_16xi32: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovups %zmm0, (%rdi) +; X64-AVX512-NEXT: retq store <16 x i32> %value, ptr %addr, align 1 ret <16 x i32> %value } -define <16 x i32> @test_store_16xi32_aligned(ptr nocapture %addr, <16 x i32> %value) { -; SSE32-LABEL: test_store_16xi32_aligned: -; SSE32: # %bb.0: -; SSE32-NEXT: movaps %xmm0, (%rdi) -; SSE32-NEXT: movaps %xmm1, 16(%rdi) -; SSE32-NEXT: movaps %xmm2, 32(%rdi) -; SSE32-NEXT: movaps %xmm3, 48(%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_16xi32_aligned: -; SSE64: # %bb.0: -; SSE64-NEXT: subl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 16 -; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: movaps %xmm0, (%eax) -; SSE64-NEXT: movaps %xmm1, 16(%eax) -; SSE64-NEXT: movaps %xmm2, 32(%eax) -; SSE64-NEXT: movaps %xmm3, 48(%eax) -; SSE64-NEXT: addl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 4 -; SSE64-NEXT: retl -; -; AVXONLY32-LABEL: test_store_16xi32_aligned: -; AVXONLY32: # %bb.0: -; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi) -; AVXONLY32-NEXT: vmovaps %ymm1, 32(%rdi) -; AVXONLY32-NEXT: retq -; -; AVXONLY64-LABEL: test_store_16xi32_aligned: -; AVXONLY64: # %bb.0: -; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVXONLY64-NEXT: vmovaps %ymm0, (%eax) -; AVXONLY64-NEXT: vmovaps %ymm1, 32(%eax) -; AVXONLY64-NEXT: retl -; -; AVX51232-LABEL: test_store_16xi32_aligned: -; AVX51232: # %bb.0: -; AVX51232-NEXT: vmovaps %zmm0, (%rdi) -; AVX51232-NEXT: retq -; -; AVX51264-LABEL: test_store_16xi32_aligned: -; AVX51264: # %bb.0: -; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX51264-NEXT: vmovaps %zmm0, (%eax) -; AVX51264-NEXT: retl +define <16 x i32> @test_store_16xi32_aligned(ptr nocapture %addr, <16 x i32> %value) nounwind { +; X86-SSE-LABEL: test_store_16xi32_aligned: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: movaps %xmm1, 16(%eax) +; X86-SSE-NEXT: movaps %xmm2, 32(%eax) +; X86-SSE-NEXT: movaps %xmm3, 48(%eax) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_16xi32_aligned: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps %xmm0, (%rdi) +; X64-SSE-NEXT: movaps %xmm1, 16(%rdi) +; X64-SSE-NEXT: movaps %xmm2, 32(%rdi) +; X64-SSE-NEXT: movaps %xmm3, 48(%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX1-LABEL: test_store_16xi32_aligned: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX1-NEXT: vmovaps %ymm1, 32(%eax) +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_store_16xi32_aligned: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps %ymm0, (%rdi) +; X64-AVX1-NEXT: vmovaps %ymm1, 32(%rdi) +; X64-AVX1-NEXT: retq +; +; X86-AVX512-LABEL: test_store_16xi32_aligned: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vmovaps %zmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-AVX512-LABEL: test_store_16xi32_aligned: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovaps %zmm0, (%rdi) +; X64-AVX512-NEXT: retq store <16 x i32> %value, ptr %addr, align 64 ret <16 x i32> %value } -define <16 x float> @test_store_16xf32(ptr nocapture %addr, <16 x float> %value) { -; SSE32-LABEL: test_store_16xf32: -; SSE32: # %bb.0: -; SSE32-NEXT: movups %xmm0, (%rdi) -; SSE32-NEXT: movups %xmm1, 16(%rdi) -; SSE32-NEXT: movups %xmm2, 32(%rdi) -; SSE32-NEXT: movups %xmm3, 48(%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_16xf32: -; SSE64: # %bb.0: -; SSE64-NEXT: subl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 16 -; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: movups %xmm0, (%eax) -; SSE64-NEXT: movups %xmm1, 16(%eax) -; SSE64-NEXT: movups %xmm2, 32(%eax) -; SSE64-NEXT: movups %xmm3, 48(%eax) -; SSE64-NEXT: addl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 4 -; SSE64-NEXT: retl -; -; AVXONLY32-LABEL: test_store_16xf32: -; AVXONLY32: # %bb.0: -; AVXONLY32-NEXT: vmovups %ymm0, (%rdi) -; AVXONLY32-NEXT: vmovups %ymm1, 32(%rdi) -; AVXONLY32-NEXT: retq -; -; AVXONLY64-LABEL: test_store_16xf32: -; AVXONLY64: # %bb.0: -; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVXONLY64-NEXT: vmovups %ymm0, (%eax) -; AVXONLY64-NEXT: vmovups %ymm1, 32(%eax) -; AVXONLY64-NEXT: retl -; -; AVX51232-LABEL: test_store_16xf32: -; AVX51232: # %bb.0: -; AVX51232-NEXT: vmovups %zmm0, (%rdi) -; AVX51232-NEXT: retq -; -; AVX51264-LABEL: test_store_16xf32: -; AVX51264: # %bb.0: -; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX51264-NEXT: vmovups %zmm0, (%eax) -; AVX51264-NEXT: retl +define <16 x float> @test_store_16xf32(ptr nocapture %addr, <16 x float> %value) nounwind { +; X86-SSE-LABEL: test_store_16xf32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movups %xmm0, (%eax) +; X86-SSE-NEXT: movups %xmm1, 16(%eax) +; X86-SSE-NEXT: movups %xmm2, 32(%eax) +; X86-SSE-NEXT: movups %xmm3, 48(%eax) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_16xf32: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movups %xmm0, (%rdi) +; X64-SSE-NEXT: movups %xmm1, 16(%rdi) +; X64-SSE-NEXT: movups %xmm2, 32(%rdi) +; X64-SSE-NEXT: movups %xmm3, 48(%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX1-LABEL: test_store_16xf32: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovups %ymm0, (%eax) +; X86-AVX1-NEXT: vmovups %ymm1, 32(%eax) +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_store_16xf32: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovups %ymm0, (%rdi) +; X64-AVX1-NEXT: vmovups %ymm1, 32(%rdi) +; X64-AVX1-NEXT: retq +; +; X86-AVX512-LABEL: test_store_16xf32: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vmovups %zmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-AVX512-LABEL: test_store_16xf32: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovups %zmm0, (%rdi) +; X64-AVX512-NEXT: retq store <16 x float> %value, ptr %addr, align 1 ret <16 x float> %value } -define <16 x float> @test_store_16xf32_aligned(ptr nocapture %addr, <16 x float> %value) { -; SSE32-LABEL: test_store_16xf32_aligned: -; SSE32: # %bb.0: -; SSE32-NEXT: movaps %xmm0, (%rdi) -; SSE32-NEXT: movaps %xmm1, 16(%rdi) -; SSE32-NEXT: movaps %xmm2, 32(%rdi) -; SSE32-NEXT: movaps %xmm3, 48(%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_16xf32_aligned: -; SSE64: # %bb.0: -; SSE64-NEXT: subl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 16 -; SSE64-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: movaps %xmm0, (%eax) -; SSE64-NEXT: movaps %xmm1, 16(%eax) -; SSE64-NEXT: movaps %xmm2, 32(%eax) -; SSE64-NEXT: movaps %xmm3, 48(%eax) -; SSE64-NEXT: addl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 4 -; SSE64-NEXT: retl -; -; AVXONLY32-LABEL: test_store_16xf32_aligned: -; AVXONLY32: # %bb.0: -; AVXONLY32-NEXT: vmovaps %ymm0, (%rdi) -; AVXONLY32-NEXT: vmovaps %ymm1, 32(%rdi) -; AVXONLY32-NEXT: retq -; -; AVXONLY64-LABEL: test_store_16xf32_aligned: -; AVXONLY64: # %bb.0: -; AVXONLY64-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVXONLY64-NEXT: vmovaps %ymm0, (%eax) -; AVXONLY64-NEXT: vmovaps %ymm1, 32(%eax) -; AVXONLY64-NEXT: retl -; -; AVX51232-LABEL: test_store_16xf32_aligned: -; AVX51232: # %bb.0: -; AVX51232-NEXT: vmovaps %zmm0, (%rdi) -; AVX51232-NEXT: retq -; -; AVX51264-LABEL: test_store_16xf32_aligned: -; AVX51264: # %bb.0: -; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX51264-NEXT: vmovaps %zmm0, (%eax) -; AVX51264-NEXT: retl +define <16 x float> @test_store_16xf32_aligned(ptr nocapture %addr, <16 x float> %value) nounwind { +; X86-SSE-LABEL: test_store_16xf32_aligned: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: movaps {{[0-9]+}}(%esp), %xmm3 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: movaps %xmm1, 16(%eax) +; X86-SSE-NEXT: movaps %xmm2, 32(%eax) +; X86-SSE-NEXT: movaps %xmm3, 48(%eax) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_16xf32_aligned: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: movaps %xmm0, (%rdi) +; X64-SSE-NEXT: movaps %xmm1, 16(%rdi) +; X64-SSE-NEXT: movaps %xmm2, 32(%rdi) +; X64-SSE-NEXT: movaps %xmm3, 48(%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX1-LABEL: test_store_16xf32_aligned: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX1-NEXT: vmovaps %ymm0, (%eax) +; X86-AVX1-NEXT: vmovaps %ymm1, 32(%eax) +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_store_16xf32_aligned: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovaps %ymm0, (%rdi) +; X64-AVX1-NEXT: vmovaps %ymm1, 32(%rdi) +; X64-AVX1-NEXT: retq +; +; X86-AVX512-LABEL: test_store_16xf32_aligned: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vmovaps %zmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-AVX512-LABEL: test_store_16xf32_aligned: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vmovaps %zmm0, (%rdi) +; X64-AVX512-NEXT: retq store <16 x float> %value, ptr %addr, align 64 ret <16 x float> %value } -define <8 x double> @test_store_8xf64(ptr nocapture %addr, <8 x double> %value, <8 x double> %value2) { -; SSE32-LABEL: test_store_8xf64: -; SSE32: # %bb.0: -; SSE32-NEXT: addpd %xmm4, %xmm0 -; SSE32-NEXT: movupd %xmm0, (%rdi) -; SSE32-NEXT: addpd %xmm5, %xmm1 -; SSE32-NEXT: movupd %xmm1, 16(%rdi) -; SSE32-NEXT: addpd %xmm6, %xmm2 -; SSE32-NEXT: movupd %xmm2, 32(%rdi) -; SSE32-NEXT: addpd %xmm7, %xmm3 -; SSE32-NEXT: movupd %xmm3, 48(%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_8xf64: -; SSE64: # %bb.0: -; SSE64-NEXT: subl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 16 -; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm4 -; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm5 -; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm6 -; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3 -; SSE64-NEXT: addpd %xmm4, %xmm3 -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm0 -; SSE64-NEXT: movupd %xmm0, (%eax) -; SSE64-NEXT: addpd %xmm6, %xmm1 -; SSE64-NEXT: movupd %xmm1, 16(%eax) -; SSE64-NEXT: addpd %xmm5, %xmm2 -; SSE64-NEXT: movupd %xmm2, 32(%eax) -; SSE64-NEXT: movupd %xmm3, 48(%eax) -; SSE64-NEXT: addl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 4 -; SSE64-NEXT: retl -; -; AVXONLY32-LABEL: test_store_8xf64: -; AVXONLY32: # %bb.0: -; AVXONLY32-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVXONLY32-NEXT: vmovupd %ymm0, (%rdi) -; AVXONLY32-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVXONLY32-NEXT: vmovupd %ymm1, 32(%rdi) -; AVXONLY32-NEXT: retq -; -; AVXONLY64-LABEL: test_store_8xf64: -; AVXONLY64: # %bb.0: -; AVXONLY64-NEXT: pushl %ebp -; AVXONLY64-NEXT: .cfi_def_cfa_offset 8 -; AVXONLY64-NEXT: .cfi_offset %ebp, -8 -; AVXONLY64-NEXT: movl %esp, %ebp -; AVXONLY64-NEXT: .cfi_def_cfa_register %ebp -; AVXONLY64-NEXT: andl $-32, %esp -; AVXONLY64-NEXT: subl $32, %esp -; AVXONLY64-NEXT: vmovapd 40(%ebp), %ymm3 -; AVXONLY64-NEXT: movl 8(%ebp), %eax -; AVXONLY64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVXONLY64-NEXT: vmovupd %ymm0, (%eax) -; AVXONLY64-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVXONLY64-NEXT: vmovupd %ymm1, 32(%eax) -; AVXONLY64-NEXT: movl %ebp, %esp -; AVXONLY64-NEXT: popl %ebp -; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4 -; AVXONLY64-NEXT: retl -; -; AVX51232-LABEL: test_store_8xf64: -; AVX51232: # %bb.0: -; AVX51232-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX51232-NEXT: vmovupd %zmm0, (%rdi) -; AVX51232-NEXT: retq -; -; AVX51264-LABEL: test_store_8xf64: -; AVX51264: # %bb.0: -; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX51264-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX51264-NEXT: vmovupd %zmm0, (%eax) -; AVX51264-NEXT: retl +define <8 x double> @test_store_8xf64(ptr nocapture %addr, <8 x double> %value, <8 x double> %value2) nounwind { +; X86-SSE-LABEL: test_store_8xf64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: movapd {{[0-9]+}}(%esp), %xmm4 +; X86-SSE-NEXT: movapd {{[0-9]+}}(%esp), %xmm5 +; X86-SSE-NEXT: movapd {{[0-9]+}}(%esp), %xmm6 +; X86-SSE-NEXT: movapd {{[0-9]+}}(%esp), %xmm3 +; X86-SSE-NEXT: addpd %xmm4, %xmm3 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: addpd {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: movupd %xmm0, (%eax) +; X86-SSE-NEXT: addpd %xmm6, %xmm1 +; X86-SSE-NEXT: movupd %xmm1, 16(%eax) +; X86-SSE-NEXT: addpd %xmm5, %xmm2 +; X86-SSE-NEXT: movupd %xmm2, 32(%eax) +; X86-SSE-NEXT: movupd %xmm3, 48(%eax) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_8xf64: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: addpd %xmm4, %xmm0 +; X64-SSE-NEXT: movupd %xmm0, (%rdi) +; X64-SSE-NEXT: addpd %xmm5, %xmm1 +; X64-SSE-NEXT: movupd %xmm1, 16(%rdi) +; X64-SSE-NEXT: addpd %xmm6, %xmm2 +; X64-SSE-NEXT: movupd %xmm2, 32(%rdi) +; X64-SSE-NEXT: addpd %xmm7, %xmm3 +; X64-SSE-NEXT: movupd %xmm3, 48(%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX1-LABEL: test_store_8xf64: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-32, %esp +; X86-AVX1-NEXT: subl $32, %esp +; X86-AVX1-NEXT: vmovapd 40(%ebp), %ymm3 +; X86-AVX1-NEXT: movl 8(%ebp), %eax +; X86-AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovupd %ymm0, (%eax) +; X86-AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; X86-AVX1-NEXT: vmovupd %ymm1, 32(%eax) +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_store_8xf64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovupd %ymm0, (%rdi) +; X64-AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovupd %ymm1, 32(%rdi) +; X64-AVX1-NEXT: retq +; +; X86-AVX512-LABEL: test_store_8xf64: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; X86-AVX512-NEXT: vmovupd %zmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-AVX512-LABEL: test_store_8xf64: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovupd %zmm0, (%rdi) +; X64-AVX512-NEXT: retq %foo = fadd <8 x double> %value, %value2 ; to force dobule type on store store <8 x double> %foo, ptr %addr, align 1 ret <8 x double> %foo } -define <8 x double> @test_store_8xf64_aligned(ptr nocapture %addr, <8 x double> %value, <8 x double> %value2) { -; SSE32-LABEL: test_store_8xf64_aligned: -; SSE32: # %bb.0: -; SSE32-NEXT: addpd %xmm4, %xmm0 -; SSE32-NEXT: movapd %xmm0, (%rdi) -; SSE32-NEXT: addpd %xmm5, %xmm1 -; SSE32-NEXT: movapd %xmm1, 16(%rdi) -; SSE32-NEXT: addpd %xmm6, %xmm2 -; SSE32-NEXT: movapd %xmm2, 32(%rdi) -; SSE32-NEXT: addpd %xmm7, %xmm3 -; SSE32-NEXT: movapd %xmm3, 48(%rdi) -; SSE32-NEXT: retq -; -; SSE64-LABEL: test_store_8xf64_aligned: -; SSE64: # %bb.0: -; SSE64-NEXT: subl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 16 -; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm4 -; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm5 -; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm6 -; SSE64-NEXT: movapd {{[0-9]+}}(%esp), %xmm3 -; SSE64-NEXT: addpd %xmm4, %xmm3 -; SSE64-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE64-NEXT: addpd {{[0-9]+}}(%esp), %xmm0 -; SSE64-NEXT: movapd %xmm0, (%eax) -; SSE64-NEXT: addpd %xmm6, %xmm1 -; SSE64-NEXT: movapd %xmm1, 16(%eax) -; SSE64-NEXT: addpd %xmm5, %xmm2 -; SSE64-NEXT: movapd %xmm2, 32(%eax) -; SSE64-NEXT: movapd %xmm3, 48(%eax) -; SSE64-NEXT: addl $12, %esp -; SSE64-NEXT: .cfi_def_cfa_offset 4 -; SSE64-NEXT: retl -; -; AVXONLY32-LABEL: test_store_8xf64_aligned: -; AVXONLY32: # %bb.0: -; AVXONLY32-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVXONLY32-NEXT: vmovapd %ymm0, (%rdi) -; AVXONLY32-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVXONLY32-NEXT: vmovapd %ymm1, 32(%rdi) -; AVXONLY32-NEXT: retq -; -; AVXONLY64-LABEL: test_store_8xf64_aligned: -; AVXONLY64: # %bb.0: -; AVXONLY64-NEXT: pushl %ebp -; AVXONLY64-NEXT: .cfi_def_cfa_offset 8 -; AVXONLY64-NEXT: .cfi_offset %ebp, -8 -; AVXONLY64-NEXT: movl %esp, %ebp -; AVXONLY64-NEXT: .cfi_def_cfa_register %ebp -; AVXONLY64-NEXT: andl $-32, %esp -; AVXONLY64-NEXT: subl $32, %esp -; AVXONLY64-NEXT: vmovapd 40(%ebp), %ymm3 -; AVXONLY64-NEXT: movl 8(%ebp), %eax -; AVXONLY64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 -; AVXONLY64-NEXT: vmovapd %ymm0, (%eax) -; AVXONLY64-NEXT: vaddpd %ymm3, %ymm1, %ymm1 -; AVXONLY64-NEXT: vmovapd %ymm1, 32(%eax) -; AVXONLY64-NEXT: movl %ebp, %esp -; AVXONLY64-NEXT: popl %ebp -; AVXONLY64-NEXT: .cfi_def_cfa %esp, 4 -; AVXONLY64-NEXT: retl -; -; AVX51232-LABEL: test_store_8xf64_aligned: -; AVX51232: # %bb.0: -; AVX51232-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX51232-NEXT: vmovapd %zmm0, (%rdi) -; AVX51232-NEXT: retq -; -; AVX51264-LABEL: test_store_8xf64_aligned: -; AVX51264: # %bb.0: -; AVX51264-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX51264-NEXT: vaddpd %zmm1, %zmm0, %zmm0 -; AVX51264-NEXT: vmovapd %zmm0, (%eax) -; AVX51264-NEXT: retl +define <8 x double> @test_store_8xf64_aligned(ptr nocapture %addr, <8 x double> %value, <8 x double> %value2) nounwind { +; X86-SSE-LABEL: test_store_8xf64_aligned: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: subl $12, %esp +; X86-SSE-NEXT: movapd {{[0-9]+}}(%esp), %xmm4 +; X86-SSE-NEXT: movapd {{[0-9]+}}(%esp), %xmm5 +; X86-SSE-NEXT: movapd {{[0-9]+}}(%esp), %xmm6 +; X86-SSE-NEXT: movapd {{[0-9]+}}(%esp), %xmm3 +; X86-SSE-NEXT: addpd %xmm4, %xmm3 +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: addpd {{[0-9]+}}(%esp), %xmm0 +; X86-SSE-NEXT: movapd %xmm0, (%eax) +; X86-SSE-NEXT: addpd %xmm6, %xmm1 +; X86-SSE-NEXT: movapd %xmm1, 16(%eax) +; X86-SSE-NEXT: addpd %xmm5, %xmm2 +; X86-SSE-NEXT: movapd %xmm2, 32(%eax) +; X86-SSE-NEXT: movapd %xmm3, 48(%eax) +; X86-SSE-NEXT: addl $12, %esp +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: test_store_8xf64_aligned: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: addpd %xmm4, %xmm0 +; X64-SSE-NEXT: movapd %xmm0, (%rdi) +; X64-SSE-NEXT: addpd %xmm5, %xmm1 +; X64-SSE-NEXT: movapd %xmm1, 16(%rdi) +; X64-SSE-NEXT: addpd %xmm6, %xmm2 +; X64-SSE-NEXT: movapd %xmm2, 32(%rdi) +; X64-SSE-NEXT: addpd %xmm7, %xmm3 +; X64-SSE-NEXT: movapd %xmm3, 48(%rdi) +; X64-SSE-NEXT: retq +; +; X86-AVX1-LABEL: test_store_8xf64_aligned: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: pushl %ebp +; X86-AVX1-NEXT: movl %esp, %ebp +; X86-AVX1-NEXT: andl $-32, %esp +; X86-AVX1-NEXT: subl $32, %esp +; X86-AVX1-NEXT: vmovapd 40(%ebp), %ymm3 +; X86-AVX1-NEXT: movl 8(%ebp), %eax +; X86-AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; X86-AVX1-NEXT: vmovapd %ymm0, (%eax) +; X86-AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; X86-AVX1-NEXT: vmovapd %ymm1, 32(%eax) +; X86-AVX1-NEXT: movl %ebp, %esp +; X86-AVX1-NEXT: popl %ebp +; X86-AVX1-NEXT: retl +; +; X64-AVX1-LABEL: test_store_8xf64_aligned: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0 +; X64-AVX1-NEXT: vmovapd %ymm0, (%rdi) +; X64-AVX1-NEXT: vaddpd %ymm3, %ymm1, %ymm1 +; X64-AVX1-NEXT: vmovapd %ymm1, 32(%rdi) +; X64-AVX1-NEXT: retq +; +; X86-AVX512-LABEL: test_store_8xf64_aligned: +; X86-AVX512: # %bb.0: +; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; X86-AVX512-NEXT: vmovapd %zmm0, (%eax) +; X86-AVX512-NEXT: retl +; +; X64-AVX512-LABEL: test_store_8xf64_aligned: +; X64-AVX512: # %bb.0: +; X64-AVX512-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; X64-AVX512-NEXT: vmovapd %zmm0, (%rdi) +; X64-AVX512-NEXT: retq %foo = fadd <8 x double> %value, %value2 ; to force dobule type on store store <8 x double> %foo, ptr %addr, align 64 ret <8 x double> %foo