131 changes: 47 additions & 84 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -4,14 +4,14 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12

; These patterns are produced by LoopVectorizer for interleaved loads.
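;
; For orientation only (a hedged sketch, not part of the generated checks):
; these tests model a scalar loop that reads stride-separated fields each
; iteration, e.g. in[5*i+0] .. in[5*i+4] for the stride-5 cases. The
; LoopVectorizer turns that into one wide load of the interleaved data plus
; one deinterleaving shufflevector per field, which is the IR shape the
; function bodies below check, for example:
;
;   %wide.vec = load <10 x i16>, ptr %in.vec, align 64
;   %strided.vec0 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 0, i32 5>
;   %strided.vec1 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 1, i32 6>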

@@ -116,83 +116,44 @@ define void @load_i16_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm1, (%r9)
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512F-SLOW-LABEL: load_i16_stride5_vf2:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vmovd %xmm2, (%rsi)
; AVX512F-SLOW-NEXT: vmovd %xmm3, (%rdx)
; AVX512F-SLOW-NEXT: vmovd %xmm4, (%rcx)
; AVX512F-SLOW-NEXT: vmovd %xmm0, (%r8)
; AVX512F-SLOW-NEXT: vmovd %xmm1, (%r9)
; AVX512F-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i16_stride5_vf2:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512F-FAST-NEXT: vmovd %xmm2, (%rsi)
; AVX512F-FAST-NEXT: vmovd %xmm3, (%rdx)
; AVX512F-FAST-NEXT: vmovd %xmm4, (%rcx)
; AVX512F-FAST-NEXT: vmovd %xmm0, (%r8)
; AVX512F-FAST-NEXT: vmovd %xmm1, (%r9)
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: load_i16_stride5_vf2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vmovd %xmm2, (%rsi)
; AVX512BW-SLOW-NEXT: vmovd %xmm3, (%rdx)
; AVX512BW-SLOW-NEXT: vmovd %xmm4, (%rcx)
; AVX512BW-SLOW-NEXT: vmovd %xmm0, (%r8)
; AVX512BW-SLOW-NEXT: vmovd %xmm1, (%r9)
; AVX512BW-SLOW-NEXT: retq
; AVX512-SLOW-LABEL: load_i16_stride5_vf2:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX512-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vmovd %xmm2, (%rsi)
; AVX512-SLOW-NEXT: vmovd %xmm3, (%rdx)
; AVX512-SLOW-NEXT: vmovd %xmm4, (%rcx)
; AVX512-SLOW-NEXT: vmovd %xmm0, (%r8)
; AVX512-SLOW-NEXT: vmovd %xmm1, (%r9)
; AVX512-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: load_i16_stride5_vf2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FAST-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512BW-FAST-NEXT: vmovd %xmm2, (%rsi)
; AVX512BW-FAST-NEXT: vmovd %xmm3, (%rdx)
; AVX512BW-FAST-NEXT: vmovd %xmm4, (%rcx)
; AVX512BW-FAST-NEXT: vmovd %xmm0, (%r8)
; AVX512BW-FAST-NEXT: vmovd %xmm1, (%r9)
; AVX512BW-FAST-NEXT: retq
; AVX512-FAST-LABEL: load_i16_stride5_vf2:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FAST-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX512-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX512-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512-FAST-NEXT: vmovd %xmm2, (%rsi)
; AVX512-FAST-NEXT: vmovd %xmm3, (%rdx)
; AVX512-FAST-NEXT: vmovd %xmm4, (%rcx)
; AVX512-FAST-NEXT: vmovd %xmm0, (%r8)
; AVX512-FAST-NEXT: vmovd %xmm1, (%r9)
; AVX512-FAST-NEXT: retq
%wide.vec = load <10 x i16>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 0, i32 5>
%strided.vec1 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 1, i32 6>
@@ -7766,8 +7727,10 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2: {{.*}}
; AVX2-ONLY: {{.*}}
; AVX512: {{.*}}
; AVX512BW-FAST: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
; AVX512DQ-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
97 changes: 36 additions & 61 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -4,14 +4,14 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12

; These patterns are produced by LoopVectorizer for interleaved loads.

@@ -150,32 +150,32 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm0, (%rax)
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512F-SLOW-LABEL: load_i16_stride6_vf2:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpbroadcastw 4(%rdi), %xmm4
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpbroadcastw 20(%rdi), %xmm6
; AVX512F-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm7
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-SLOW-NEXT: vmovd %xmm3, (%rsi)
; AVX512F-SLOW-NEXT: vmovd %xmm2, (%rdx)
; AVX512F-SLOW-NEXT: vmovd %xmm4, (%rcx)
; AVX512F-SLOW-NEXT: vmovd %xmm5, (%r8)
; AVX512F-SLOW-NEXT: vmovd %xmm6, (%r9)
; AVX512F-SLOW-NEXT: vmovd %xmm0, (%rax)
; AVX512F-SLOW-NEXT: retq
; AVX512-SLOW-LABEL: load_i16_stride6_vf2:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vpbroadcastw 4(%rdi), %xmm4
; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vpbroadcastw 20(%rdi), %xmm6
; AVX512-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm7
; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-SLOW-NEXT: vpsrlq $48, %xmm1, %xmm1
; AVX512-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-SLOW-NEXT: vmovd %xmm3, (%rsi)
; AVX512-SLOW-NEXT: vmovd %xmm2, (%rdx)
; AVX512-SLOW-NEXT: vmovd %xmm4, (%rcx)
; AVX512-SLOW-NEXT: vmovd %xmm5, (%r8)
; AVX512-SLOW-NEXT: vmovd %xmm6, (%r9)
; AVX512-SLOW-NEXT: vmovd %xmm0, (%rax)
; AVX512-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i16_stride6_vf2:
; AVX512F-FAST: # %bb.0:
@@ -203,33 +203,6 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovd %xmm0, (%rax)
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: load_i16_stride6_vf2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vpbroadcastw 4(%rdi), %xmm4
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vpbroadcastw 20(%rdi), %xmm6
; AVX512BW-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm7
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512BW-SLOW-NEXT: vpsrlq $48, %xmm1, %xmm1
; AVX512BW-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-SLOW-NEXT: vmovd %xmm3, (%rsi)
; AVX512BW-SLOW-NEXT: vmovd %xmm2, (%rdx)
; AVX512BW-SLOW-NEXT: vmovd %xmm4, (%rcx)
; AVX512BW-SLOW-NEXT: vmovd %xmm5, (%r8)
; AVX512BW-SLOW-NEXT: vmovd %xmm6, (%r9)
; AVX512BW-SLOW-NEXT: vmovd %xmm0, (%rax)
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: load_i16_stride6_vf2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
@@ -11906,8 +11879,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2: {{.*}}
; AVX2-ONLY: {{.*}}
; AVX512: {{.*}}
; AVX512-FAST: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
; AVX512F: {{.*}}
106 changes: 39 additions & 67 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -4,14 +4,14 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12

; These patterns are produced by LoopVectorizer for interleaved loads.

@@ -166,35 +166,35 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovd %xmm0, (%rax)
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512F-SLOW-LABEL: load_i16_stride7_vf2:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
; AVX512F-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm7
; AVX512F-SLOW-NEXT: vpsrlq $48, %xmm1, %xmm8
; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX512F-SLOW-NEXT: vmovd %xmm2, (%rsi)
; AVX512F-SLOW-NEXT: vmovd %xmm4, (%rdx)
; AVX512F-SLOW-NEXT: vmovd %xmm6, (%rcx)
; AVX512F-SLOW-NEXT: vpextrd $2, %xmm5, (%r8)
; AVX512F-SLOW-NEXT: vmovd %xmm7, (%r9)
; AVX512F-SLOW-NEXT: vmovd %xmm3, (%r10)
; AVX512F-SLOW-NEXT: vmovd %xmm0, (%rax)
; AVX512F-SLOW-NEXT: retq
; AVX512-SLOW-LABEL: load_i16_stride7_vf2:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
; AVX512-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm7
; AVX512-SLOW-NEXT: vpsrlq $48, %xmm1, %xmm8
; AVX512-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX512-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX512-SLOW-NEXT: vmovd %xmm2, (%rsi)
; AVX512-SLOW-NEXT: vmovd %xmm4, (%rdx)
; AVX512-SLOW-NEXT: vmovd %xmm6, (%rcx)
; AVX512-SLOW-NEXT: vpextrd $2, %xmm5, (%r8)
; AVX512-SLOW-NEXT: vmovd %xmm7, (%r9)
; AVX512-SLOW-NEXT: vmovd %xmm3, (%r10)
; AVX512-SLOW-NEXT: vmovd %xmm0, (%rax)
; AVX512-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i16_stride7_vf2:
; AVX512F-FAST: # %bb.0:
@@ -224,36 +224,6 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vmovd %xmm0, (%rax)
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: load_i16_stride7_vf2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vpsrld $16, %xmm0, %xmm3
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,1,2,3]
; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7]
; AVX512BW-SLOW-NEXT: vpbroadcastw 8(%rdi), %xmm7
; AVX512BW-SLOW-NEXT: vpsrlq $48, %xmm1, %xmm8
; AVX512BW-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512BW-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX512BW-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX512BW-SLOW-NEXT: vmovd %xmm2, (%rsi)
; AVX512BW-SLOW-NEXT: vmovd %xmm4, (%rdx)
; AVX512BW-SLOW-NEXT: vmovd %xmm6, (%rcx)
; AVX512BW-SLOW-NEXT: vpextrd $2, %xmm5, (%r8)
; AVX512BW-SLOW-NEXT: vmovd %xmm7, (%r9)
; AVX512BW-SLOW-NEXT: vmovd %xmm3, (%r10)
; AVX512BW-SLOW-NEXT: vmovd %xmm0, (%rax)
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: load_i16_stride7_vf2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
@@ -16204,8 +16174,10 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2: {{.*}}
; AVX2-ONLY: {{.*}}
; AVX512: {{.*}}
; AVX512-FAST: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
; AVX512F: {{.*}}
18 changes: 10 additions & 8 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
@@ -4,14 +4,14 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12

; These patterns are produced by LoopVectorizer for interleaved loads.

@@ -9893,6 +9893,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE: {{.*}}
; AVX2-SLOW: {{.*}}
; AVX512: {{.*}}
; AVX512-FAST: {{.*}}
; AVX512-SLOW: {{.*}}
; AVX512BW-FAST: {{.*}}
; AVX512BW-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
82 changes: 32 additions & 50 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll
@@ -4,14 +4,14 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12

; These patterns are produced by LoopVectorizer for interleaved loads.

@@ -124,49 +124,27 @@ define void @load_i32_stride2_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
; AVX512F-SLOW-LABEL: load_i32_stride2_vf8:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512F-SLOW-NEXT: vmovaps (%rdi), %ymm1
; AVX512F-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],mem[1,3],ymm1[5,7],mem[5,7]
; AVX512F-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX512F-SLOW-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512F-SLOW-NEXT: vmovaps %ymm1, (%rdx)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
; AVX512-SLOW-LABEL: load_i32_stride2_vf8:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-SLOW-NEXT: vmovaps (%rdi), %ymm1
; AVX512-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],mem[1,3],ymm1[5,7],mem[5,7]
; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX512-SLOW-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-SLOW-NEXT: vmovaps %ymm1, (%rdx)
; AVX512-SLOW-NEXT: vzeroupper
; AVX512-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i32_stride2_vf8:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
; AVX512F-FAST-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
; AVX512F-FAST-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512F-FAST-NEXT: vmovdqa %ymm2, (%rdx)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: load_i32_stride2_vf8:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vmovaps (%rdi), %ymm1
; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3],mem[1,3],ymm1[5,7],mem[5,7]
; AVX512BW-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX512BW-SLOW-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512BW-SLOW-NEXT: vmovaps %ymm1, (%rdx)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: load_i32_stride2_vf8:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
; AVX512BW-FAST-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
; AVX512BW-FAST-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512BW-FAST-NEXT: vmovdqa %ymm2, (%rdx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
; AVX512-FAST-LABEL: load_i32_stride2_vf8:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-FAST-NEXT: vmovdqa (%rdi), %ymm1
; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15]
; AVX512-FAST-NEXT: vpermi2d 32(%rdi), %ymm1, %ymm2
; AVX512-FAST-NEXT: vpmovqd %zmm0, (%rsi)
; AVX512-FAST-NEXT: vmovdqa %ymm2, (%rdx)
; AVX512-FAST-NEXT: vzeroupper
; AVX512-FAST-NEXT: retq
%wide.vec = load <16 x i32>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%strided.vec1 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -761,15 +739,19 @@ define void @load_i32_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
; AVX2-FAST-PERLANE: {{.*}}
; AVX2-SLOW: {{.*}}
; AVX512BW: {{.*}}
; AVX512BW-FAST: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
; AVX512DQ-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
; AVX512F: {{.*}}
; AVX512F-FAST: {{.*}}
; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}
; AVX512F-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK10: {{.*}}
100 changes: 38 additions & 62 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -4,14 +4,14 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12

; These patterns are produced by LoopVectorizer for interleaved loads.

@@ -59,61 +59,33 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovlps %xmm1, (%rcx)
; AVX2-ONLY-NEXT: retq
;
; AVX512F-SLOW-LABEL: load_i32_stride3_vf2:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: vmovaps (%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512F-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512F-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512F-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512F-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm3
; AVX512F-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512F-SLOW-NEXT: vmovlps %xmm2, (%rsi)
; AVX512F-SLOW-NEXT: vmovlps %xmm0, (%rdx)
; AVX512F-SLOW-NEXT: vmovlps %xmm1, (%rcx)
; AVX512F-SLOW-NEXT: retq
; AVX512-SLOW-LABEL: load_i32_stride3_vf2:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: vmovaps (%rdi), %xmm0
; AVX512-SLOW-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm3
; AVX512-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512-SLOW-NEXT: vmovlps %xmm2, (%rsi)
; AVX512-SLOW-NEXT: vmovlps %xmm0, (%rdx)
; AVX512-SLOW-NEXT: vmovlps %xmm1, (%rcx)
; AVX512-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i32_stride3_vf2:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,0,5,0]
; AVX512F-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
; AVX512F-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm0
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-FAST-NEXT: vmovq %xmm2, (%rsi)
; AVX512F-FAST-NEXT: vmovq %xmm3, (%rdx)
; AVX512F-FAST-NEXT: vmovq %xmm0, (%rcx)
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: load_i32_stride3_vf2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovaps (%rdi), %xmm0
; AVX512BW-SLOW-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512BW-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
; AVX512BW-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm3
; AVX512BW-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512BW-SLOW-NEXT: vmovlps %xmm2, (%rsi)
; AVX512BW-SLOW-NEXT: vmovlps %xmm0, (%rdx)
; AVX512BW-SLOW-NEXT: vmovlps %xmm1, (%rcx)
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: load_i32_stride3_vf2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,0,5,0]
; AVX512BW-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
; AVX512BW-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm0
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-FAST-NEXT: vmovq %xmm2, (%rsi)
; AVX512BW-FAST-NEXT: vmovq %xmm3, (%rdx)
; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-FAST-NEXT: retq
; AVX512-FAST-LABEL: load_i32_stride3_vf2:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,0,5,0]
; AVX512-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
; AVX512-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm0
; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512-FAST-NEXT: vmovq %xmm2, (%rsi)
; AVX512-FAST-NEXT: vmovq %xmm3, (%rdx)
; AVX512-FAST-NEXT: vmovq %xmm0, (%rcx)
; AVX512-FAST-NEXT: retq
%wide.vec = load <6 x i32>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 0, i32 3>
%strided.vec1 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 1, i32 4>
@@ -3049,15 +3021,19 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX1: {{.*}}
; AVX2: {{.*}}
; AVX512BW: {{.*}}
; AVX512BW-FAST: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
; AVX512DQ-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
; AVX512F: {{.*}}
; AVX512F-FAST: {{.*}}
; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}
; AVX512F-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK10: {{.*}}
100 changes: 38 additions & 62 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -4,14 +4,14 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12

; These patterns are produced by LoopVectorizer for interleaved loads.

@@ -59,61 +59,33 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX2-ONLY-NEXT: retq
;
; AVX512F-SLOW-LABEL: load_i32_stride4_vf2:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-SLOW-NEXT: vmovq %xmm2, (%rsi)
; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rdx)
; AVX512F-SLOW-NEXT: vmovq %xmm0, (%rcx)
; AVX512F-SLOW-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512F-SLOW-NEXT: retq
; AVX512-SLOW-LABEL: load_i32_stride4_vf2:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-SLOW-NEXT: vmovq %xmm2, (%rsi)
; AVX512-SLOW-NEXT: vmovq %xmm3, (%rdx)
; AVX512-SLOW-NEXT: vmovq %xmm0, (%rcx)
; AVX512-SLOW-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i32_stride4_vf2:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512F-FAST-NEXT: vmovq %xmm2, (%rsi)
; AVX512F-FAST-NEXT: vmovq %xmm3, (%rdx)
; AVX512F-FAST-NEXT: vmovq %xmm0, (%rcx)
; AVX512F-FAST-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: load_i32_stride4_vf2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-SLOW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-SLOW-NEXT: vmovq %xmm2, (%rsi)
; AVX512BW-SLOW-NEXT: vmovq %xmm3, (%rdx)
; AVX512BW-SLOW-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: load_i32_stride4_vf2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512BW-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FAST-NEXT: vmovq %xmm2, (%rsi)
; AVX512BW-FAST-NEXT: vmovq %xmm3, (%rdx)
; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rcx)
; AVX512BW-FAST-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512BW-FAST-NEXT: retq
; AVX512-FAST-LABEL: load_i32_stride4_vf2:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FAST-NEXT: vmovq %xmm2, (%rsi)
; AVX512-FAST-NEXT: vmovq %xmm3, (%rdx)
; AVX512-FAST-NEXT: vmovq %xmm0, (%rcx)
; AVX512-FAST-NEXT: vpextrq $1, %xmm0, (%r8)
; AVX512-FAST-NEXT: retq
%wide.vec = load <8 x i32>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <2 x i32> <i32 0, i32 4>
%strided.vec1 = shufflevector <8 x i32> %wide.vec, <8 x i32> poison, <2 x i32> <i32 1, i32 5>
@@ -3466,15 +3438,19 @@ define void @load_i32_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE: {{.*}}
; AVX2-SLOW: {{.*}}
; AVX512BW: {{.*}}
; AVX512BW-FAST: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
; AVX512DQ-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
; AVX512F: {{.*}}
; AVX512F-FAST: {{.*}}
; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}
; AVX512F-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK10: {{.*}}
148 changes: 54 additions & 94 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
@@ -4,14 +4,14 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12

; These patterns are produced by LoopVectorizer for interleaved loads.

@@ -78,93 +78,49 @@ define void @load_i32_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
; AVX512F-SLOW-LABEL: load_i32_stride5_vf2:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-SLOW-NEXT: vpextrd $2, %xmm1, %eax
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX512F-SLOW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm5
; AVX512F-SLOW-NEXT: vpextrd $3, %xmm1, %eax
; AVX512F-SLOW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
; AVX512F-SLOW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512F-SLOW-NEXT: vpbroadcastd 16(%rdi), %ymm5
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rsi)
; AVX512F-SLOW-NEXT: vmovq %xmm4, (%rdx)
; AVX512F-SLOW-NEXT: vmovq %xmm1, (%rcx)
; AVX512F-SLOW-NEXT: vmovq %xmm0, (%r8)
; AVX512F-SLOW-NEXT: vmovq %xmm2, (%r9)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
; AVX512-SLOW-LABEL: load_i32_stride5_vf2:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512-SLOW-NEXT: vpextrd $2, %xmm1, %eax
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX512-SLOW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
; AVX512-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm5
; AVX512-SLOW-NEXT: vpextrd $3, %xmm1, %eax
; AVX512-SLOW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
; AVX512-SLOW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512-SLOW-NEXT: vpbroadcastd 16(%rdi), %ymm5
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
; AVX512-SLOW-NEXT: vmovq %xmm3, (%rsi)
; AVX512-SLOW-NEXT: vmovq %xmm4, (%rdx)
; AVX512-SLOW-NEXT: vmovq %xmm1, (%rcx)
; AVX512-SLOW-NEXT: vmovq %xmm0, (%r8)
; AVX512-SLOW-NEXT: vmovq %xmm2, (%r9)
; AVX512-SLOW-NEXT: vzeroupper
; AVX512-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i32_stride5_vf2:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6]
; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm4
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7]
; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm5
; AVX512F-FAST-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512F-FAST-NEXT: vpbroadcastd 16(%rdi), %ymm1
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
; AVX512F-FAST-NEXT: vmovq %xmm3, (%rsi)
; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx)
; AVX512F-FAST-NEXT: vmovq %xmm5, (%rcx)
; AVX512F-FAST-NEXT: vmovq %xmm0, (%r8)
; AVX512F-FAST-NEXT: vmovq %xmm1, (%r9)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: load_i32_stride5_vf2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-SLOW-NEXT: vpextrd $2, %xmm1, %eax
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; AVX512BW-SLOW-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4
; AVX512BW-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm5
; AVX512BW-SLOW-NEXT: vpextrd $3, %xmm1, %eax
; AVX512BW-SLOW-NEXT: vpinsrd $1, %eax, %xmm5, %xmm1
; AVX512BW-SLOW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512BW-SLOW-NEXT: vpbroadcastd 16(%rdi), %ymm5
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
; AVX512BW-SLOW-NEXT: vmovq %xmm3, (%rsi)
; AVX512BW-SLOW-NEXT: vmovq %xmm4, (%rdx)
; AVX512BW-SLOW-NEXT: vmovq %xmm1, (%rcx)
; AVX512BW-SLOW-NEXT: vmovq %xmm0, (%r8)
; AVX512BW-SLOW-NEXT: vmovq %xmm2, (%r9)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: load_i32_stride5_vf2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6]
; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm4
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7]
; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm5
; AVX512BW-FAST-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512BW-FAST-NEXT: vpbroadcastd 16(%rdi), %ymm1
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
; AVX512BW-FAST-NEXT: vmovq %xmm3, (%rsi)
; AVX512BW-FAST-NEXT: vmovq %xmm4, (%rdx)
; AVX512BW-FAST-NEXT: vmovq %xmm5, (%rcx)
; AVX512BW-FAST-NEXT: vmovq %xmm0, (%r8)
; AVX512BW-FAST-NEXT: vmovq %xmm1, (%r9)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
; AVX512-FAST-LABEL: load_i32_stride5_vf2:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,6,1,6]
; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm4
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [2,7,2,7]
; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm5
; AVX512-FAST-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX512-FAST-NEXT: vpbroadcastd 16(%rdi), %ymm1
; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
; AVX512-FAST-NEXT: vmovq %xmm3, (%rsi)
; AVX512-FAST-NEXT: vmovq %xmm4, (%rdx)
; AVX512-FAST-NEXT: vmovq %xmm5, (%rcx)
; AVX512-FAST-NEXT: vmovq %xmm0, (%r8)
; AVX512-FAST-NEXT: vmovq %xmm1, (%r9)
; AVX512-FAST-NEXT: vzeroupper
; AVX512-FAST-NEXT: retq
%wide.vec = load <10 x i32>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <10 x i32> %wide.vec, <10 x i32> poison, <2 x i32> <i32 0, i32 5>
%strided.vec1 = shufflevector <10 x i32> %wide.vec, <10 x i32> poison, <2 x i32> <i32 1, i32 6>
@@ -5295,14 +5251,18 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST: {{.*}}
; AVX2-FAST-PERLANE: {{.*}}
; AVX2-SLOW: {{.*}}
; AVX512BW-FAST: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
; AVX512DQ-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
; AVX512F-FAST: {{.*}}
; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}
; AVX512F-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK10: {{.*}}
170 changes: 71 additions & 99 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -4,14 +4,14 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12

; These patterns are produced by LoopVectorizer for interleaved loads.

@@ -102,69 +102,69 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
; AVX512F-SLOW-LABEL: load_i32_stride6_vf2:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512F-SLOW-NEXT: vextractps $2, %xmm1, %r10d
; AVX512F-SLOW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3
; AVX512F-SLOW-NEXT: vextractps $3, %xmm1, %r10d
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512F-SLOW-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512F-SLOW-NEXT: vmovd %xmm2, %r10d
; AVX512F-SLOW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512F-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2]
; AVX512F-SLOW-NEXT: # xmm2 = mem[0,0]
; AVX512F-SLOW-NEXT: vmovaps 32(%rdi), %ymm5
; AVX512F-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512F-SLOW-NEXT: vpermps %ymm5, %ymm2, %ymm2
; AVX512F-SLOW-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
; AVX512F-SLOW-NEXT: # xmm6 = mem[0,0]
; AVX512F-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rsi)
; AVX512F-SLOW-NEXT: vmovq %xmm1, (%rdx)
; AVX512F-SLOW-NEXT: vmovq %xmm4, (%rcx)
; AVX512F-SLOW-NEXT: vmovq %xmm0, (%r8)
; AVX512F-SLOW-NEXT: vmovlps %xmm2, (%r9)
; AVX512F-SLOW-NEXT: vmovlps %xmm5, (%rax)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
; AVX512-SLOW-LABEL: load_i32_stride6_vf2:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-SLOW-NEXT: vmovaps 16(%rdi), %xmm1
; AVX512-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-SLOW-NEXT: vextractps $2, %xmm1, %r10d
; AVX512-SLOW-NEXT: vpinsrd $1, %r10d, %xmm0, %xmm3
; AVX512-SLOW-NEXT: vextractps $3, %xmm1, %r10d
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512-SLOW-NEXT: vpinsrd $1, %r10d, %xmm1, %xmm1
; AVX512-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512-SLOW-NEXT: vmovd %xmm2, %r10d
; AVX512-SLOW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2]
; AVX512-SLOW-NEXT: # xmm2 = mem[0,0]
; AVX512-SLOW-NEXT: vmovaps 32(%rdi), %ymm5
; AVX512-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512-SLOW-NEXT: vpermps %ymm5, %ymm2, %ymm2
; AVX512-SLOW-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
; AVX512-SLOW-NEXT: # xmm6 = mem[0,0]
; AVX512-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX512-SLOW-NEXT: vmovq %xmm3, (%rsi)
; AVX512-SLOW-NEXT: vmovq %xmm1, (%rdx)
; AVX512-SLOW-NEXT: vmovq %xmm4, (%rcx)
; AVX512-SLOW-NEXT: vmovq %xmm0, (%r8)
; AVX512-SLOW-NEXT: vmovlps %xmm2, (%r9)
; AVX512-SLOW-NEXT: vmovlps %xmm5, (%rax)
; AVX512-SLOW-NEXT: vzeroupper
; AVX512-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i32_stride6_vf2:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6]
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1
; AVX512F-FAST-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7]
; AVX512F-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5]
; AVX512F-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
; AVX512F-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
; AVX512F-FAST-NEXT: # xmm1 = mem[0,0]
; AVX512F-FAST-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512F-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7]
; AVX512F-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1
; AVX512F-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
; AVX512F-FAST-NEXT: # xmm6 = mem[0,0]
; AVX512F-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3
; AVX512F-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-FAST-NEXT: vmovq %xmm4, (%rdx)
; AVX512F-FAST-NEXT: vmovq %xmm2, (%rcx)
; AVX512F-FAST-NEXT: vmovq %xmm5, (%r8)
; AVX512F-FAST-NEXT: vmovlps %xmm1, (%r9)
; AVX512F-FAST-NEXT: vmovlps %xmm3, (%rax)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
; AVX512-FAST-LABEL: load_i32_stride6_vf2:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6]
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm1
; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX512-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7]
; AVX512-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
; AVX512-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5]
; AVX512-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
; AVX512-FAST-NEXT: # xmm1 = mem[0,0]
; AVX512-FAST-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7]
; AVX512-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1
; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
; AVX512-FAST-NEXT: # xmm6 = mem[0,0]
; AVX512-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3
; AVX512-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512-FAST-NEXT: vmovq %xmm4, (%rdx)
; AVX512-FAST-NEXT: vmovq %xmm2, (%rcx)
; AVX512-FAST-NEXT: vmovq %xmm5, (%r8)
; AVX512-FAST-NEXT: vmovlps %xmm1, (%r9)
; AVX512-FAST-NEXT: vmovlps %xmm3, (%rax)
; AVX512-FAST-NEXT: vzeroupper
; AVX512-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: load_i32_stride6_vf2:
; AVX512BW-SLOW: # %bb.0:
@@ -198,37 +198,6 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-SLOW-NEXT: vmovlps %xmm5, (%rax)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: load_i32_stride6_vf2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,0,6]
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm1
; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX512BW-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,7,1,7]
; AVX512BW-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5]
; AVX512BW-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
; AVX512BW-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
; AVX512BW-FAST-NEXT: # xmm1 = mem[0,0]
; AVX512BW-FAST-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512BW-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7]
; AVX512BW-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1
; AVX512BW-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
; AVX512BW-FAST-NEXT: # xmm6 = mem[0,0]
; AVX512BW-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3
; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-FAST-NEXT: vmovq %xmm4, (%rdx)
; AVX512BW-FAST-NEXT: vmovq %xmm2, (%rcx)
; AVX512BW-FAST-NEXT: vmovq %xmm5, (%r8)
; AVX512BW-FAST-NEXT: vmovlps %xmm1, (%r9)
; AVX512BW-FAST-NEXT: vmovlps %xmm3, (%rax)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
%wide.vec = load <12 x i32>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <2 x i32> <i32 0, i32 6>
%strided.vec1 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <2 x i32> <i32 1, i32 7>
@@ -10329,14 +10298,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX: {{.*}}
; AVX1: {{.*}}
; AVX2: {{.*}}
; AVX512BW-FAST: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
; AVX512DQ-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
; AVX512F-FAST: {{.*}}
; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}
; AVX512F-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK10: {{.*}}
223 changes: 79 additions & 144 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -4,14 +4,14 @@
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-SLOW,FALLBACK2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST,FALLBACK3
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX1,AVX2,AVX2-ONLY,AVX2-FAST-PERLANE,FALLBACK4
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512F-ONLY-SLOW,FALLBACK5
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512F-ONLY-FAST,FALLBACK6
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-SLOW,AVX512F-SLOW,AVX512DQ-SLOW,FALLBACK7
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512F,AVX512-FAST,AVX512F-FAST,AVX512DQ-FAST,FALLBACK8
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512BW-ONLY-SLOW,FALLBACK9
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512BW-ONLY-FAST,FALLBACK10
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-SLOW,AVX512BW-SLOW,AVX512DQBW-SLOW,FALLBACK11
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX512,AVX512BW,AVX512-FAST,AVX512BW-FAST,AVX512DQBW-FAST,FALLBACK12

; These patterns are produced by LoopVectorizer for interleaved loads.

@@ -120,143 +120,74 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vzeroupper
; AVX2-ONLY-NEXT: retq
;
; AVX512F-SLOW-LABEL: load_i32_stride7_vf2:
; AVX512F-SLOW: # %bb.0:
; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512F-SLOW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512F-SLOW-NEXT: vmovd %xmm1, %r11d
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX512F-SLOW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,11,4,11]
; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm6
; AVX512F-SLOW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,2,3,5,4,6,7]
; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7
; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5
; AVX512F-SLOW-NEXT: vmovq %xmm2, (%rsi)
; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rdx)
; AVX512F-SLOW-NEXT: vmovq %xmm4, (%rcx)
; AVX512F-SLOW-NEXT: vmovq %xmm0, (%r8)
; AVX512F-SLOW-NEXT: vmovq %xmm1, (%r9)
; AVX512F-SLOW-NEXT: vmovq %xmm7, (%r10)
; AVX512F-SLOW-NEXT: vmovq %xmm5, (%rax)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
; AVX512-SLOW-LABEL: load_i32_stride7_vf2:
; AVX512-SLOW: # %bb.0:
; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-SLOW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512-SLOW-NEXT: vmovd %xmm1, %r11d
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX512-SLOW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
; AVX512-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; AVX512-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,11,4,11]
; AVX512-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX512-SLOW-NEXT: vmovdqa (%rdi), %ymm6
; AVX512-SLOW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,2,3,5,4,6,7]
; AVX512-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7]
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
; AVX512-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5
; AVX512-SLOW-NEXT: vmovq %xmm2, (%rsi)
; AVX512-SLOW-NEXT: vmovq %xmm3, (%rdx)
; AVX512-SLOW-NEXT: vmovq %xmm4, (%rcx)
; AVX512-SLOW-NEXT: vmovq %xmm0, (%r8)
; AVX512-SLOW-NEXT: vmovq %xmm1, (%r9)
; AVX512-SLOW-NEXT: vmovq %xmm7, (%r10)
; AVX512-SLOW-NEXT: vmovq %xmm5, (%rax)
; AVX512-SLOW-NEXT: vzeroupper
; AVX512-SLOW-NEXT: retq
;
; AVX512F-FAST-LABEL: load_i32_stride7_vf2:
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512F-FAST-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
; AVX512F-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512F-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [7,2,7,2]
; AVX512F-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm5
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,11,4,11]
; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm6
; AVX512F-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
; AVX512F-FAST-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
; AVX512F-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7
; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
; AVX512F-FAST-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512F-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
; AVX512F-FAST-NEXT: vextracti128 $1, %ymm8, %xmm1
; AVX512F-FAST-NEXT: vmovq %xmm2, (%rsi)
; AVX512F-FAST-NEXT: vmovq %xmm3, (%rdx)
; AVX512F-FAST-NEXT: vmovq %xmm4, (%rcx)
; AVX512F-FAST-NEXT: vmovq %xmm5, (%r8)
; AVX512F-FAST-NEXT: vmovq %xmm0, (%r9)
; AVX512F-FAST-NEXT: vmovq %xmm7, (%r10)
; AVX512F-FAST-NEXT: vmovq %xmm1, (%rax)
; AVX512F-FAST-NEXT: vzeroupper
; AVX512F-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: load_i32_stride7_vf2:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512BW-SLOW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512BW-SLOW-NEXT: vmovd %xmm1, %r11d
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX512BW-SLOW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
; AVX512BW-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,11,4,11]
; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm6
; AVX512BW-SLOW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,0,2,3,5,4,6,7]
; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7
; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,3,2,3,6,7,6,7]
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5
; AVX512BW-SLOW-NEXT: vmovq %xmm2, (%rsi)
; AVX512BW-SLOW-NEXT: vmovq %xmm3, (%rdx)
; AVX512BW-SLOW-NEXT: vmovq %xmm4, (%rcx)
; AVX512BW-SLOW-NEXT: vmovq %xmm0, (%r8)
; AVX512BW-SLOW-NEXT: vmovq %xmm1, (%r9)
; AVX512BW-SLOW-NEXT: vmovq %xmm7, (%r10)
; AVX512BW-SLOW-NEXT: vmovq %xmm5, (%rax)
; AVX512BW-SLOW-NEXT: vzeroupper
; AVX512BW-SLOW-NEXT: retq
;
; AVX512BW-FAST-LABEL: load_i32_stride7_vf2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512BW-FAST-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
; AVX512BW-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512BW-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [7,2,7,2]
; AVX512BW-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm5
; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,11,4,11]
; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm6
; AVX512BW-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
; AVX512BW-FAST-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512BW-FAST-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7
; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
; AVX512BW-FAST-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512BW-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm8, %xmm1
; AVX512BW-FAST-NEXT: vmovq %xmm2, (%rsi)
; AVX512BW-FAST-NEXT: vmovq %xmm3, (%rdx)
; AVX512BW-FAST-NEXT: vmovq %xmm4, (%rcx)
; AVX512BW-FAST-NEXT: vmovq %xmm5, (%r8)
; AVX512BW-FAST-NEXT: vmovq %xmm0, (%r9)
; AVX512BW-FAST-NEXT: vmovq %xmm7, (%r10)
; AVX512BW-FAST-NEXT: vmovq %xmm1, (%rax)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
; AVX512-FAST-LABEL: load_i32_stride7_vf2:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-FAST-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,1,4]
; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [7,2,7,2]
; AVX512-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm5
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,11,4,11]
; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512-FAST-NEXT: vmovdqa (%rdi), %ymm6
; AVX512-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [13,4,6,7,13,4,6,7]
; AVX512-FAST-NEXT: # ymm7 = mem[0,1,0,1]
; AVX512-FAST-NEXT: vpermi2d %ymm6, %ymm1, %ymm7
; AVX512-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7
; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [6,13,6,7,6,13,6,7]
; AVX512-FAST-NEXT: # ymm8 = mem[0,1,0,1]
; AVX512-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm8
; AVX512-FAST-NEXT: vextracti128 $1, %ymm8, %xmm1
; AVX512-FAST-NEXT: vmovq %xmm2, (%rsi)
; AVX512-FAST-NEXT: vmovq %xmm3, (%rdx)
; AVX512-FAST-NEXT: vmovq %xmm4, (%rcx)
; AVX512-FAST-NEXT: vmovq %xmm5, (%r8)
; AVX512-FAST-NEXT: vmovq %xmm0, (%r9)
; AVX512-FAST-NEXT: vmovq %xmm7, (%r10)
; AVX512-FAST-NEXT: vmovq %xmm1, (%rax)
; AVX512-FAST-NEXT: vzeroupper
; AVX512-FAST-NEXT: retq
%wide.vec = load <14 x i32>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 0, i32 7>
%strided.vec1 = shufflevector <14 x i32> %wide.vec, <14 x i32> poison, <2 x i32> <i32 1, i32 8>
@@ -12700,14 +12631,18 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX: {{.*}}
; AVX1: {{.*}}
; AVX2: {{.*}}
; AVX512BW-FAST: {{.*}}
; AVX512BW-ONLY-FAST: {{.*}}
; AVX512BW-ONLY-SLOW: {{.*}}
; AVX512BW-SLOW: {{.*}}
; AVX512DQ-FAST: {{.*}}
; AVX512DQ-SLOW: {{.*}}
; AVX512DQBW-FAST: {{.*}}
; AVX512DQBW-SLOW: {{.*}}
; AVX512F-FAST: {{.*}}
; AVX512F-ONLY-FAST: {{.*}}
; AVX512F-ONLY-SLOW: {{.*}}
; AVX512F-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK10: {{.*}}