50 changes: 25 additions & 25 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
@@ -271,23 +271,23 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,9,17,25,1,9,17,25]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,10,18,26,2,10,18,26]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,11,19,27,3,11,19,27]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,12,20,28,4,12,20,28]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,13,21,29,5,13,21,29]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,14,22,30,6,14,22,30]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [7,15,23,31,7,15,23,31]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
@@ -560,7 +560,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7]
; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm15 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vpermt2d %xmm13, %xmm15, %xmm14
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
@@ -615,7 +615,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5]
; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0]
; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm2
; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm9, %xmm2
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
@@ -626,7 +626,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm1, %xmm0
; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [3,7,3,7]
; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm10 = [3,7,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm10, %xmm15
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm14
@@ -1353,7 +1353,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19
; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7]
; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm17, %xmm14
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3]
; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -1490,7 +1490,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5]
; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0]
; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1
; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm15, %xmm1
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
@@ -1523,7 +1523,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7]
; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm18 = [3,7,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm18, %xmm11
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3]
; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -3238,7 +3238,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7]
; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vpermt2d %xmm11, %xmm0, %xmm7
; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm6
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0
@@ -3409,7 +3409,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1
; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [3,7,3,7]
; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm12 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm12, %xmm4
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3]
@@ -3563,7 +3563,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,5,1,5]
; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = [1,5,0,0]
; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0
; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm1, %xmm0
; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm11
@@ -3630,7 +3630,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm27 = [3,7,3,7]
; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm27 = [3,7,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm27, %xmm9
; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm0
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
@@ -3733,7 +3733,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31
; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm0
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,5,1,5]
; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = [1,5,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm4, %xmm0
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
@@ -7297,7 +7297,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7]
; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1
; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm0, %xmm1
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16
@@ -7711,7 +7711,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7]
; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm16 = [3,7,0,0]
; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1
; AVX512F-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload
; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -8050,7 +8050,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5]
; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0
; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm9, %xmm0
; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm30
@@ -8225,7 +8225,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7]
; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0]
; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1
; AVX512F-FAST-NEXT: vpermt2d %xmm28, %xmm0, %xmm1
; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16
@@ -8477,7 +8477,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0
; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5]
; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0]
; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm15, %xmm0
; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1]
@@ -8642,7 +8642,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7]
; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0]
; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX512F-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload
; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
@@ -78,7 +78,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3]
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,0,5,0]
; AVX512-FAST-NEXT: vmovd {{.*#+}} xmm3 = [5,0,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm3
; AVX512-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm0
; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
8 changes: 3 additions & 5 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll
@@ -78,7 +78,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FAST-NEXT: vmovq %xmm2, (%rsi)
@@ -160,8 +160,7 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5
; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5]
; AVX2-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm7 = [1,5,0,0]
; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm7
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
@@ -171,8 +170,7 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [3,7,3,7]
; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = [3,7,0,0]
; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm3
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsi)
24 changes: 9 additions & 15 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -86,12 +86,10 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3]
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3]
; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,2,4,2]
; AVX2-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm3
; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
; AVX2-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm0
; AVX2-ONLY-NEXT: vmovlps %xmm4, (%rsi)
; AVX2-ONLY-NEXT: vmovlps %xmm2, (%rdx)
@@ -118,13 +116,11 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-SLOW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
; AVX512-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2]
; AVX512-SLOW-NEXT: # xmm2 = mem[0,0]
; AVX512-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = [4,2,0,0]
; AVX512-SLOW-NEXT: vmovaps 32(%rdi), %ymm5
; AVX512-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512-SLOW-NEXT: vpermps %ymm5, %ymm2, %ymm2
; AVX512-SLOW-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
; AVX512-SLOW-NEXT: # xmm6 = mem[0,0]
; AVX512-SLOW-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX512-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
; AVX512-SLOW-NEXT: vmovq %xmm3, (%rsi)
; AVX512-SLOW-NEXT: vmovq %xmm1, (%rdx)
@@ -147,15 +143,13 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4]
; AVX512-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5]
; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm5 = [3,5,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5
; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2]
; AVX512-FAST-NEXT: # xmm1 = mem[0,0]
; AVX512-FAST-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0]
; AVX512-FAST-NEXT: vmovaps 32(%rdi), %ymm3
; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7]
; AVX512-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1
; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3]
; AVX512-FAST-NEXT: # xmm6 = mem[0,0]
; AVX512-FAST-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0]
; AVX512-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3
; AVX512-FAST-NEXT: vmovq %xmm0, (%rsi)
; AVX512-FAST-NEXT: vmovq %xmm4, (%rdx)
@@ -308,13 +302,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3]
; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2]
; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm9 = [4,2,4,2]
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [4,2,0,0]
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm2
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3]
; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,3,5,3]
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [5,3,0,0]
; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm7, %ymm1
; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi)
45 changes: 19 additions & 26 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -100,8 +100,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3]
; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3]
; AVX2-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm4
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
@@ -134,7 +133,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
; AVX512-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,11,4,11]
; AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [4,11,0,0]
; AVX512-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX512-SLOW-NEXT: vmovdqa (%rdi), %ymm6
; AVX512-SLOW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1
@@ -165,9 +164,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4
; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [7,2,7,2]
; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm5 = [7,2,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm5
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,11,4,11]
; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = [4,11,0,0]
; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512-FAST-NEXT: vmovdqa (%rdi), %ymm6
; AVX512-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm0
@@ -355,8 +354,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
; AVX2-SLOW-NEXT: # xmm10 = mem[0,0]
; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
@@ -414,8 +412,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
; AVX2-FAST-NEXT: # xmm10 = mem[0,0]
; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0]
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
@@ -472,8 +469,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm9
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3]
; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm10, %ymm10
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
@@ -850,7 +846,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm10
; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -954,7 +950,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm10
; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm11
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -1058,7 +1054,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm10
; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm11
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3]
; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm12, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
@@ -1880,7 +1876,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3]
; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm5 = [4,3,0,0]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7]
; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12
; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm10
@@ -2104,7 +2100,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm0
; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [4,3,4,3]
; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = [4,3,0,0]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11
; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm12
@@ -2332,7 +2328,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3]
; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm5 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm5, %ymm10
@@ -4253,7 +4249,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3]
; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [4,3,0,0]
; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
@@ -4763,7 +4759,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,3,4,3]
; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = [4,3,0,0]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
@@ -5274,7 +5270,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3]
; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
Expand Down Expand Up @@ -9154,8 +9150,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm0
; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm1
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3]
; AVX2-SLOW-NEXT: # xmm7 = mem[0,0]
; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm7 = [4,3,0,0]
; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
Expand Down Expand Up @@ -10195,8 +10190,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm0
; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm9
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3]
; AVX2-FAST-NEXT: # xmm4 = mem[0,0]
; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
Expand Down Expand Up @@ -11245,8 +11239,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm1
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3]
; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0,0]
; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm7 = [4,3,0,0]
; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
@@ -153,7 +153,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX512-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5]
; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0]
; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
; AVX512-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %ymm1
13 changes: 5 additions & 8 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll
@@ -182,8 +182,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -199,7 +198,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX2-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3
; AVX2-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX2-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -215,7 +214,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX512F-NEXT: vpand %xmm0, %xmm2, %xmm3
; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
@@ -281,8 +280,7 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm6
; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vpackuswb %xmm6, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
@@ -438,8 +436,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm8, %xmm12
; AVX1-ONLY-NEXT: vpand %xmm1, %xmm7, %xmm1
; AVX1-ONLY-NEXT: vpackuswb %xmm12, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
108 changes: 41 additions & 67 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -214,57 +214,31 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movq %xmm1, (%r8)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i8_stride4_vf8:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX1-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX1-ONLY-NEXT: vmovq %xmm3, (%rdx)
; AVX1-ONLY-NEXT: vmovq %xmm4, (%rcx)
; AVX1-ONLY-NEXT: vmovq %xmm1, (%r8)
; AVX1-ONLY-NEXT: retq
;
; AVX2-ONLY-LABEL: load_i8_stride4_vf8:
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi)
; AVX2-ONLY-NEXT: vmovq %xmm3, (%rdx)
; AVX2-ONLY-NEXT: vmovq %xmm4, (%rcx)
; AVX2-ONLY-NEXT: vmovq %xmm1, (%r8)
; AVX2-ONLY-NEXT: retq
; AVX1-LABEL: load_i8_stride4_vf8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX1-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX1-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX1-NEXT: vmovq %xmm0, (%rsi)
; AVX1-NEXT: vmovq %xmm3, (%rdx)
; AVX1-NEXT: vmovq %xmm4, (%rcx)
; AVX1-NEXT: vmovq %xmm1, (%r8)
; AVX1-NEXT: retq
;
; AVX512-LABEL: load_i8_stride4_vf8:
; AVX512: # %bb.0:
@@ -413,7 +387,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
@@ -422,7 +396,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
@@ -431,7 +405,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
@@ -440,7 +414,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -461,7 +435,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
@@ -470,7 +444,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
@@ -479,7 +453,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
@@ -488,7 +462,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -780,7 +754,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm3
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm4
; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5
@@ -804,7 +778,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm10
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm11
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm13
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
@@ -822,7 +796,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm11
; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm12
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm14
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
@@ -840,7 +814,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
@@ -1545,7 +1519,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5
@@ -1611,7 +1585,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
@@ -1662,7 +1636,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
@@ -1705,7 +1679,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
82 changes: 28 additions & 54 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -150,47 +150,26 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movd %xmm3, (%r9)
; SSE-NEXT: retq
;
; AVX1-ONLY-LABEL: load_i8_stride5_vf4:
; AVX1-ONLY: # %bb.0:
; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1
; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm3
; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vmovd %xmm3, (%rsi)
; AVX1-ONLY-NEXT: vmovd %xmm4, (%rdx)
; AVX1-ONLY-NEXT: vmovd %xmm5, (%rcx)
; AVX1-ONLY-NEXT: vmovd %xmm6, (%r8)
; AVX1-ONLY-NEXT: vmovd %xmm0, (%r9)
; AVX1-ONLY-NEXT: retq
;
; AVX2-LABEL: load_i8_stride5_vf4:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3
; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm4
; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm5
; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6
; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovd %xmm3, (%rsi)
; AVX2-NEXT: vmovd %xmm4, (%rdx)
; AVX2-NEXT: vmovd %xmm5, (%rcx)
; AVX2-NEXT: vmovd %xmm6, (%r8)
; AVX2-NEXT: vmovd %xmm0, (%r9)
; AVX2-NEXT: retq
; AVX-LABEL: load_i8_stride5_vf4:
; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vmovdqa (%rdi), %xmm1
; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm3
; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0]
; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm4
; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1]
; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5
; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2]
; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm6
; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
; AVX-NEXT: vmovd %xmm3, (%rsi)
; AVX-NEXT: vmovd %xmm4, (%rdx)
; AVX-NEXT: vmovd %xmm5, (%rcx)
; AVX-NEXT: vmovd %xmm6, (%r8)
; AVX-NEXT: vmovd %xmm0, (%r9)
; AVX-NEXT: retq
%wide.vec = load <20 x i8>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
%strided.vec1 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16>
@@ -707,7 +686,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm1[u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm4, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm5
@@ -1551,7 +1530,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm11, %xmm5
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm11
@@ -3169,7 +3148,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12
; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm6
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1
@@ -3191,11 +3170,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm2
; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0]
; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0]
; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6
; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128]
@@ -3351,8 +3328,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm12
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6,7]
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0]
; AVX1-ONLY-NEXT: # xmm14 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm0
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5]
@@ -3485,17 +3461,15 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u]
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0]
; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7]
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [12,128,128,128,0,0,2,7,12,128,128,128,0,0,2,7]
; AVX1-ONLY-NEXT: # xmm12 = mem[0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0]
; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7]
91 changes: 34 additions & 57 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll

Large diffs are not rendered by default.

122 changes: 53 additions & 69 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll

Large diffs are not rendered by default.

200 changes: 100 additions & 100 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -342,7 +342,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-SLOW-NEXT: shrq $48, %rax
; AVX2-SLOW-NEXT: vmovd %eax, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX2-SLOW-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
; AVX2-SLOW-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9)
@@ -374,7 +374,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FAST-NEXT: shrq $48, %rax
; AVX2-FAST-NEXT: vmovd %eax, %xmm1
; AVX2-FAST-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
; AVX2-FAST-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9)
@@ -406,7 +406,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FAST-PERLANE-NEXT: shrq $48, %rax
; AVX2-FAST-PERLANE-NEXT: vmovd %eax, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0]
; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0
; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 32(%r9)
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9)
@@ -475,8 +475,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[u,u,u,u,7,15],zero,xmm0[u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[6,u,u,u,u],zero,zero,xmm3[7,u,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,255,255,255,255,0,0,0]
; AVX1-ONLY-NEXT: # xmm7 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,8,u,u,u],zero,zero,xmm2[1,9,u,u,u],zero,zero
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,8],zero,zero,xmm1[u,u,u,1,9],zero,zero,xmm1[u,u,u,2,10]
30 changes: 10 additions & 20 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -187,13 +187,11 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1]
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,2,6,10,14,3,7,11,15]
; AVX1-ONLY-NEXT: # xmm2 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,0,4,8,12,1,5,9,13]
; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ -361,47 +359,39 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,2,10,0,0,3,11,0,0,2,10,0,0,3,11]
; AVX1-ONLY-NEXT: # xmm4 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [0,0,2,10,0,0,3,11,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [2,10,0,0,3,11,0,0,2,10,0,0,3,11,0,0]
; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [2,10,0,0,3,11,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,8,0,0,1,9,0,0,0,8,0,0,1,9]
; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,8,0,0,1,9,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,8,0,0,1,9,0,0,0,8,0,0,1,9,0,0]
; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,8,0,0,1,9,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,6,14,0,0,7,15,0,0,6,14,0,0,7,15]
; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [0,0,6,14,0,0,7,15,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [6,14,0,0,7,15,0,0,6,14,0,0,7,15,0,0]
; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [6,14,0,0,7,15,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,4,12,0,0,5,13,0,0,4,12,0,0,5,13]
; AVX1-ONLY-NEXT: # xmm6 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,0,4,12,0,0,5,13,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,12,0,0,5,13,0,0,4,12,0,0,5,13,0,0]
; AVX1-ONLY-NEXT: # xmm3 = mem[0,0]
; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [4,12,0,0,5,13,0,0,0,0,0,0,0,0,0,0]
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
112 changes: 56 additions & 56 deletions llvm/test/CodeGen/X86/vector-lzcnt-128.ll

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions llvm/test/CodeGen/X86/vector-lzcnt-256.ll
@@ -13,7 +13,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -199,7 +199,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
; AVX1-LABEL: testv4i64u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4
; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -385,7 +385,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -541,7 +541,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
; AVX1-LABEL: testv8i32u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -697,7 +697,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -818,7 +818,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
; AVX1-LABEL: testv16i16u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -939,7 +939,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
@@ -1035,7 +1035,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
; AVX1-LABEL: testv32i8u:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/vector-mulfix-legalize.ll
@@ -13,7 +13,7 @@ declare <4 x i16> @llvm.umul.fix.sat.v4i16(<4 x i16>, <4 x i16>, i32 immarg)
define <4 x i16> @smulfix(<4 x i16> %a) {
; CHECK-LABEL: smulfix:
; CHECK: # %bb.0:
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,2,3,4,u,u,u,u]
; CHECK-NEXT: movq {{.*#+}} xmm1 = [1,2,3,4,0,0,0,0]
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm2
; CHECK-NEXT: psrlw $15, %xmm2
@@ -28,7 +28,7 @@ define <4 x i16> @smulfix(<4 x i16> %a) {
define <4 x i16> @umulfix(<4 x i16> %a) {
; CHECK-LABEL: umulfix:
; CHECK: # %bb.0:
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,2,3,4,u,u,u,u]
; CHECK-NEXT: movq {{.*#+}} xmm1 = [1,2,3,4,0,0,0,0]
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: pmullw %xmm1, %xmm2
; CHECK-NEXT: psrlw $15, %xmm2
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/vector-reduce-add-mask.ll
@@ -864,7 +864,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
;
; SSE41-LABEL: test_v4i16_v4i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,u,u,u,u]
; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0]
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
@@ -914,7 +914,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) {
; AVX512BW-LABEL: test_v4i16_v4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -719,7 +719,7 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; X86-SSE2-LABEL: splatvar_rotate_v2i64:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,0,0,0]
; X86-SSE2-NEXT: movd {{.*#+}} xmm2 = [64,0,0,0]
; X86-SSE2-NEXT: psubq %xmm1, %xmm2
; X86-SSE2-NEXT: movdqa %xmm0, %xmm3
; X86-SSE2-NEXT: psllq %xmm1, %xmm3
@@ -815,7 +815,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; SSE41-LABEL: splatvar_rotate_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,0,0,0]
; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0]
; SSE41-NEXT: movdqa %xmm1, %xmm3
; SSE41-NEXT: pandn %xmm2, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm4
@@ -828,7 +828,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX-LABEL: splatvar_rotate_v8i16:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -839,7 +839,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512F-LABEL: splatvar_rotate_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512F-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -850,7 +850,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512VL-LABEL: splatvar_rotate_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -861,7 +861,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512BW-LABEL: splatvar_rotate_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512BW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
@@ -872,7 +872,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX512VLBW-LABEL: splatvar_rotate_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm4
; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -649,7 +649,7 @@ define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -1786,7 +1786,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
;
; SSE41-LABEL: constant_shift_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,u,16384,8192,u,u,u,u]
; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,0,16384,8192,0,0,0,0]
; SSE41-NEXT: pmulhw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: psraw $1, %xmm0
@@ -1818,7 +1818,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1896,7 +1896,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll
@@ -1486,7 +1486,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
;
; SSE41-LABEL: constant_shift_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,u,u,u,u]
; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0]
; SSE41-NEXT: pmulhuw %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
; SSE41-NEXT: retq
@@ -1511,7 +1511,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1581,7 +1581,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll
@@ -1339,7 +1339,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v4i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1399,7 +1399,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; AVX512BW-LABEL: constant_shift_v2i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0]
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
11 changes: 5 additions & 6 deletions llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1003,40 +1003,39 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31(
;
; SSSE3-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: movq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSE41-NEXT: movq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX1-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512VLBW-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -539,7 +539,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,12,0,0,0,0]
; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,12,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -580,7 +580,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,13,0,0,0,0,0]
; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [0,0,13,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -621,7 +621,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,14,0,0,0,0,0,0]
; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [0,14,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
@@ -662,7 +662,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0
;
; AVX512VL-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,0,0,0]
; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [15,0,0,0]
; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
34 changes: 16 additions & 18 deletions llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -1281,7 +1281,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1328,7 +1328,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1375,7 +1375,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1422,7 +1422,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1469,7 +1469,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1516,7 +1516,7 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1563,7 +1563,7 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -1610,7 +1610,7 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [31,0,0,0]
; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [31,0,0,0]
; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
; AVX512VLVBMI-NEXT: retq
;
@@ -2565,12 +2565,10 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_
; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [15,14,13,12,11,10,9,8,15,14,13,12,11,10,9,8]
; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [15,14,13,12,11,10,9,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0]
; AVX1-NEXT: # xmm5 = mem[0,0]
; AVX1-NEXT: vmovq {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
@@ -2771,7 +2769,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_
; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2785,7 +2783,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_
; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; XOPAVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0]
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2803,7 +2801,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2817,7 +2815,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; XOPAVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; XOPAVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2835,7 +2833,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -2849,7 +2847,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_
; XOPAVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0]
; XOPAVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0]
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -208,7 +208,7 @@ define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8f32_06000000:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,0,0]
; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -225,7 +225,7 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8f32_70000000:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0]
; AVX2OR512VL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -1808,7 +1808,7 @@ define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8i32_06000000:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,0,0]
; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 6, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -1825,7 +1825,7 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2OR512VL-LABEL: shuffle_v8i32_70000000:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0]
; AVX2OR512VL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -199,13 +199,13 @@ define <32 x i16> @shuffle_v32i16_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_1
define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) {
; KNL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; KNL: ## %bb.0:
; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0]
; KNL-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0]
; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; SKX: ## %bb.0:
; SKX-NEXT: vmovaps {{.*#+}} xmm1 = [65535,0,0,0]
; SKX-NEXT: vmovss {{.*#+}} xmm1 = [65535,0,0,0]
; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0
; SKX-NEXT: retq
%shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -109,25 +109,25 @@ define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_
define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) {
; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0]
; AVX512DQ-NEXT: vmovss {{.*#+}} xmm1 = [255,0,0,0]
; AVX512DQ-NEXT: vandps %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0]
; AVX512VBMI-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0]
; AVX512VBMI-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512VBMI-NEXT: retq
%shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 0, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64>
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -142,7 +142,7 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_70000000:
; ALL: # %bb.0:
; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0]
; ALL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0]
; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; ALL-NEXT: ret{{[l|q]}}
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -960,7 +960,7 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) {
define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
; ALL-LABEL: shuffle_v8i64_70000000:
; ALL: # %bb.0:
; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0]
; ALL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0]
; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; ALL-NEXT: ret{{[l|q]}}
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
56 changes: 20 additions & 36 deletions llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1744,26 +1744,18 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: movss {{.*#+}} xmm0 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_test1c:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_test1c:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
; AVX-LABEL: combine_test1c:
; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%A = load <4 x i8>, ptr %a
%B = load <4 x i8>, ptr %b
%1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
@@ -1838,26 +1830,18 @@ define <4 x i8> @combine_test4c(ptr %a, ptr %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: movss {{.*#+}} xmm0 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: combine_test4c:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255]
; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_test4c:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255]
; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
; AVX-LABEL: combine_test4c:
; AVX: # %bb.0:
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT: vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%A = load <4 x i8>, ptr %a
%B = load <4 x i8>, ptr %b
%1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
@@ -3187,7 +3171,7 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
define void @PR43024() {
; SSE2-LABEL: PR43024:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSE2-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSE2-NEXT: movaps %xmm0, (%rax)
; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: xorps %xmm1, %xmm1
@@ -3198,7 +3182,7 @@ define void @PR43024() {
;
; SSSE3-LABEL: PR43024:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSSE3-NEXT: movaps %xmm0, (%rax)
; SSSE3-NEXT: addss %xmm0, %xmm0
; SSSE3-NEXT: xorps %xmm1, %xmm1
@@ -3209,7 +3193,7 @@ define void @PR43024() {
;
; SSE41-LABEL: PR43024:
; SSE41: # %bb.0:
; SSE41-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSE41-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; SSE41-NEXT: movaps %xmm0, (%rax)
; SSE41-NEXT: addss %xmm0, %xmm0
; SSE41-NEXT: xorps %xmm1, %xmm1
@@ -3220,7 +3204,7 @@ define void @PR43024() {
;
; AVX-LABEL: PR43024:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
; AVX-NEXT: vmovaps %xmm0, (%rax)
; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1