[X86] Add SSE2/AVX1/AVX512BW test coverage to interleaved load/store tests

An extension to PR51979 so that the codegen tests stay close to the cost model tests.
RKSimon committed Oct 3, 2021
1 parent a76355d commit 31d0c8f
Showing 20 changed files with 10,434 additions and 845 deletions.
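
For orientation: the added coverage follows the same RUN-line pattern in each test, one llc invocation per subtarget with shared FileCheck prefixes so that subtargets producing identical code share a single check block, and the CHECK lines are then regenerated with utils/update_llc_test_checks.py (per the NOTE at the top of each test). A condensed sketch of that scheme, taken from the i64 stride-2 file rendered below (exact prefixes vary per file, and the two fast-variable-shuffle AVX2 runs are omitted here):

; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512
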
373 changes: 339 additions & 34 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll

735 changes: 732 additions & 3 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll

993 changes: 976 additions & 17 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll

819 changes: 816 additions & 3 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll

1,072 changes: 1,069 additions & 3 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll

309 changes: 289 additions & 20 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll

277 changes: 265 additions & 12 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-2.ll
@@ -1,20 +1,44 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck --check-prefixes=AVX2 %s
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2 %s
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck --check-prefixes=AVX2 %s
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512

; These patterns are produced by LoopVectorizer for interleaved loads.
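
As an editorial illustration, not part of the test file: a scalar loop of roughly the shape below is what the LoopVectorizer's interleaved-access handling turns into the wide load plus strided shufflevectors exercised by these functions; the function name and exact IR are invented for the sketch. With a vectorization factor of 2 it becomes essentially the <4 x i64> load and the <0,2>/<1,3> shufflevector pattern of @load_i64_stride2_vf2.

define void @scalar_stride2_sketch(i64* %in, i64* %out0, i64* %out1, i64 %n) nounwind {
entry:
  ; assumes %n > 0 for brevity; a real frontend would guard the loop
  br label %loop
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %idx0 = shl i64 %i, 1                                   ; 2*i   -> even element
  %idx1 = or i64 %idx0, 1                                 ; 2*i+1 -> odd element
  %p0 = getelementptr inbounds i64, i64* %in, i64 %idx0
  %p1 = getelementptr inbounds i64, i64* %in, i64 %idx1
  %v0 = load i64, i64* %p0, align 8
  %v1 = load i64, i64* %p1, align 8
  %q0 = getelementptr inbounds i64, i64* %out0, i64 %i    ; even-lane results
  %q1 = getelementptr inbounds i64, i64* %out1, i64 %i    ; odd-lane results
  store i64 %v0, i64* %q0, align 8
  store i64 %v1, i64* %q1, align 8
  %i.next = add nuw nsw i64 %i, 1
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %loop
exit:
  ret void
}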

define void @load_i64_stride2_vf2(<4 x i64>* %in.vec, <2 x i64>* %out.vec0, <2 x i64>* %out.vec1) nounwind {
; AVX2-LABEL: load_i64_stride2_vf2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %xmm0
; AVX2-NEXT: vmovaps 16(%rdi), %xmm1
; AVX2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX2-NEXT: vmovaps %xmm2, (%rsi)
; AVX2-NEXT: vmovaps %xmm0, (%rdx)
; AVX2-NEXT: retq
; SSE-LABEL: load_i64_stride2_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: movaps %xmm2, (%rsi)
; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i64_stride2_vf2:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps (%rdi), %xmm0
; AVX-NEXT: vmovaps 16(%rdi), %xmm1
; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT: vmovaps %xmm2, (%rsi)
; AVX-NEXT: vmovaps %xmm0, (%rdx)
; AVX-NEXT: retq
;
; AVX512-LABEL: load_i64_stride2_vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps (%rdi), %xmm0
; AVX512-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX512-NEXT: vmovaps %xmm2, (%rsi)
; AVX512-NEXT: vmovaps %xmm0, (%rdx)
; AVX512-NEXT: retq
%wide.vec = load <4 x i64>, <4 x i64>* %in.vec, align 32

%strided.vec0 = shufflevector <4 x i64> %wide.vec, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
@@ -27,6 +51,37 @@ define void @load_i64_stride2_vf2(<4 x i64>* %in.vec, <2 x i64>* %out.vec0, <2 x
}

define void @load_i64_stride2_vf4(<8 x i64>* %in.vec, <4 x i64>* %out.vec0, <4 x i64>* %out.vec1) nounwind {
; SSE-LABEL: load_i64_stride2_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm0
; SSE-NEXT: movaps 16(%rdi), %xmm1
; SSE-NEXT: movaps 32(%rdi), %xmm2
; SSE-NEXT: movaps 48(%rdi), %xmm3
; SSE-NEXT: movaps %xmm2, %xmm4
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE-NEXT: movaps %xmm0, %xmm5
; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT: movaps %xmm5, (%rsi)
; SSE-NEXT: movaps %xmm4, 16(%rsi)
; SSE-NEXT: movaps %xmm0, (%rdx)
; SSE-NEXT: movaps %xmm2, 16(%rdx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i64_stride2_vf4:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
; AVX1-NEXT: vmovaps %ymm1, (%rsi)
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i64_stride2_vf4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %ymm0
@@ -39,6 +94,19 @@ define void @load_i64_stride2_vf4(<8 x i64>* %in.vec, <4 x i64>* %out.vec0, <4 x
; AVX2-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_i64_stride2_vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovaps (%rdi), %ymm0
; AVX512-NEXT: vmovaps 32(%rdi), %ymm1
; AVX512-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX512-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX512-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: vmovaps %ymm2, (%rsi)
; AVX512-NEXT: vmovaps %ymm0, (%rdx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%wide.vec = load <8 x i64>, <8 x i64>* %in.vec, align 32

%strided.vec0 = shufflevector <8 x i64> %wide.vec, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -51,6 +119,59 @@ define void @load_i64_stride2_vf4(<8 x i64>* %in.vec, <4 x i64>* %out.vec0, <4 x
}

define void @load_i64_stride2_vf8(<16 x i64>* %in.vec, <8 x i64>* %out.vec0, <8 x i64>* %out.vec1) nounwind {
; SSE-LABEL: load_i64_stride2_vf8:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm6
; SSE-NEXT: movaps 16(%rdi), %xmm8
; SSE-NEXT: movaps 32(%rdi), %xmm4
; SSE-NEXT: movaps 48(%rdi), %xmm9
; SSE-NEXT: movaps 80(%rdi), %xmm10
; SSE-NEXT: movaps 64(%rdi), %xmm5
; SSE-NEXT: movaps 112(%rdi), %xmm11
; SSE-NEXT: movaps 96(%rdi), %xmm7
; SSE-NEXT: movaps %xmm7, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0]
; SSE-NEXT: movaps %xmm5, %xmm3
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm10[0]
; SSE-NEXT: movaps %xmm4, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm9[0]
; SSE-NEXT: movaps %xmm6, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm8[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm11[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm9[1]
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1]
; SSE-NEXT: movaps %xmm3, 32(%rsi)
; SSE-NEXT: movaps %xmm0, (%rsi)
; SSE-NEXT: movaps %xmm1, 48(%rsi)
; SSE-NEXT: movaps %xmm2, 16(%rsi)
; SSE-NEXT: movaps %xmm5, 32(%rdx)
; SSE-NEXT: movaps %xmm6, (%rdx)
; SSE-NEXT: movaps %xmm7, 48(%rdx)
; SSE-NEXT: movaps %xmm4, 16(%rdx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i64_stride2_vf8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
; AVX1-NEXT: vmovaps 64(%rdi), %ymm2
; AVX1-NEXT: vmovaps 96(%rdi), %ymm3
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3],ymm3[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm3[0,1]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm5[0],ymm2[2],ymm5[2]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3]
; AVX1-NEXT: vmovaps %ymm3, 32(%rsi)
; AVX1-NEXT: vmovaps %ymm1, (%rsi)
; AVX1-NEXT: vmovaps %ymm2, 32(%rdx)
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i64_stride2_vf8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %ymm0
@@ -71,6 +192,19 @@ define void @load_i64_stride2_vf8(<16 x i64>* %in.vec, <8 x i64>* %out.vec0, <8
; AVX2-NEXT: vmovaps %ymm0, (%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_i64_stride2_vf8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,14]
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,5,7,9,11,13,15]
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm2, (%rsi)
; AVX512-NEXT: vmovdqu64 %zmm3, (%rdx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%wide.vec = load <16 x i64>, <16 x i64>* %in.vec, align 32

%strided.vec0 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -83,6 +217,105 @@ define void @load_i64_stride2_vf8(<16 x i64>* %in.vec, <8 x i64>* %out.vec0, <8
}

define void @load_i64_stride2_vf16(<32 x i64>* %in.vec, <16 x i64>* %out.vec0, <16 x i64>* %out.vec1) nounwind {
; SSE-LABEL: load_i64_stride2_vf16:
; SSE: # %bb.0:
; SSE-NEXT: movaps (%rdi), %xmm9
; SSE-NEXT: movaps 32(%rdi), %xmm14
; SSE-NEXT: movaps 48(%rdi), %xmm8
; SSE-NEXT: movaps 208(%rdi), %xmm10
; SSE-NEXT: movaps 192(%rdi), %xmm2
; SSE-NEXT: movaps 144(%rdi), %xmm11
; SSE-NEXT: movaps 128(%rdi), %xmm3
; SSE-NEXT: movaps 80(%rdi), %xmm12
; SSE-NEXT: movaps 64(%rdi), %xmm6
; SSE-NEXT: movaps 240(%rdi), %xmm13
; SSE-NEXT: movaps 224(%rdi), %xmm4
; SSE-NEXT: movaps 176(%rdi), %xmm15
; SSE-NEXT: movaps 160(%rdi), %xmm5
; SSE-NEXT: movaps 112(%rdi), %xmm1
; SSE-NEXT: movaps 96(%rdi), %xmm7
; SSE-NEXT: movaps %xmm7, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm1[1]
; SSE-NEXT: movaps %xmm5, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm15[1]
; SSE-NEXT: movaps %xmm4, %xmm15
; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm13[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm13[1]
; SSE-NEXT: movaps %xmm2, %xmm13
; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm10[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm10[1]
; SSE-NEXT: movaps %xmm3, %xmm10
; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm11[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm11[1]
; SSE-NEXT: movaps %xmm6, %xmm11
; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm12[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm12[1]
; SSE-NEXT: movaps %xmm14, %xmm12
; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm8[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm8[1]
; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 16(%rdi), %xmm8
; SSE-NEXT: movaps %xmm9, %xmm14
; SSE-NEXT: movlhps {{.*#+}} xmm14 = xmm14[0],xmm8[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1]
; SSE-NEXT: movaps %xmm13, 96(%rsi)
; SSE-NEXT: movaps %xmm10, 64(%rsi)
; SSE-NEXT: movaps %xmm11, 32(%rsi)
; SSE-NEXT: movaps %xmm14, (%rsi)
; SSE-NEXT: movaps %xmm15, 112(%rsi)
; SSE-NEXT: movaps %xmm1, 80(%rsi)
; SSE-NEXT: movaps %xmm0, 48(%rsi)
; SSE-NEXT: movaps %xmm12, 16(%rsi)
; SSE-NEXT: movaps %xmm9, (%rdx)
; SSE-NEXT: movaps %xmm6, 32(%rdx)
; SSE-NEXT: movaps %xmm3, 64(%rdx)
; SSE-NEXT: movaps %xmm2, 96(%rdx)
; SSE-NEXT: movaps %xmm4, 112(%rdx)
; SSE-NEXT: movaps %xmm5, 80(%rdx)
; SSE-NEXT: movaps %xmm7, 48(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rdx)
; SSE-NEXT: retq
;
; AVX1-LABEL: load_i64_stride2_vf16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps (%rdi), %ymm0
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
; AVX1-NEXT: vmovaps 64(%rdi), %ymm2
; AVX1-NEXT: vmovaps 96(%rdi), %ymm3
; AVX1-NEXT: vmovaps 224(%rdi), %ymm4
; AVX1-NEXT: vmovaps 192(%rdi), %ymm5
; AVX1-NEXT: vmovaps 160(%rdi), %ymm6
; AVX1-NEXT: vmovaps 128(%rdi), %ymm7
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[2,3],ymm6[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[0,1],ymm6[0,1]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm6[0],ymm8[0],ymm6[2],ymm8[2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm5[2,3],ymm4[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[0,1],ymm4[0,1]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm4[0],ymm9[0],ymm4[2],ymm9[2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm2[2,3],ymm3[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[0,1],ymm3[0,1]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm10[0],ymm2[2],ymm10[2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm11[0],ymm0[2],ymm11[2]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm9[1],ymm4[3],ymm9[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm6 = ymm6[1],ymm8[1],ymm6[3],ymm8[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm10[1],ymm2[3],ymm10[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3]
; AVX1-NEXT: vmovaps %ymm5, 96(%rsi)
; AVX1-NEXT: vmovaps %ymm1, (%rsi)
; AVX1-NEXT: vmovaps %ymm3, 32(%rsi)
; AVX1-NEXT: vmovaps %ymm7, 64(%rsi)
; AVX1-NEXT: vmovaps %ymm6, 64(%rdx)
; AVX1-NEXT: vmovaps %ymm4, 96(%rdx)
; AVX1-NEXT: vmovaps %ymm0, (%rdx)
; AVX1-NEXT: vmovaps %ymm2, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_i64_stride2_vf16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovaps (%rdi), %ymm0
@@ -119,6 +352,26 @@ define void @load_i64_stride2_vf16(<32 x i64>* %in.vec, <16 x i64>* %out.vec0, <
; AVX2-NEXT: vmovaps %ymm2, 32(%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: load_i64_stride2_vf16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512-NEXT: vmovdqu64 64(%rdi), %zmm1
; AVX512-NEXT: vmovdqu64 128(%rdi), %zmm2
; AVX512-NEXT: vmovdqu64 192(%rdi), %zmm3
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,2,4,6,8,10,12,14]
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm5
; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [1,3,5,7,9,11,13,15]
; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm0
; AVX512-NEXT: vpermt2q %zmm3, %zmm6, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm4, 64(%rsi)
; AVX512-NEXT: vmovdqu64 %zmm5, (%rsi)
; AVX512-NEXT: vmovdqu64 %zmm2, 64(%rdx)
; AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%wide.vec = load <32 x i64>, <32 x i64>* %in.vec, align 32

%strided.vec0 = shufflevector <32 x i64> %wide.vec, <32 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>