119 changes: 49 additions & 70 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -1119,7 +1119,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE: # %bb.0:
; SSE-NEXT: movdqa 64(%rdi), %xmm9
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa 16(%rdi), %xmm6
; SSE-NEXT: movdqa 16(%rdi), %xmm15
; SSE-NEXT: movdqa 32(%rdi), %xmm10
; SSE-NEXT: movdqa 48(%rdi), %xmm2
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
@@ -1143,8 +1143,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255]
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
; SSE-NEXT: movdqa %xmm4, %xmm5
; SSE-NEXT: pandn %xmm6, %xmm5
; SSE-NEXT: movdqa %xmm6, %xmm15
; SSE-NEXT: pandn %xmm15, %xmm5
; SSE-NEXT: movdqa %xmm1, %xmm6
; SSE-NEXT: movdqa %xmm1, %xmm13
; SSE-NEXT: pand %xmm4, %xmm6
@@ -5210,45 +5209,38 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX: # %bb.0:
; AVX-NEXT: subq $488, %rsp # imm = 0x1E8
; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,9,14,0,4,9,14,0,4,9,14,0,4,9,14,0]
; AVX-NEXT: vmovdqa (%rdi), %xmm4
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa 32(%rdi), %xmm7
; AVX-NEXT: vmovdqa 48(%rdi), %xmm5
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm0
; AVX-NEXT: vmovdqa %xmm1, %xmm11
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa (%rdi), %xmm8
; AVX-NEXT: vmovdqa 16(%rdi), %xmm11
; AVX-NEXT: vmovdqa 32(%rdi), %xmm12
; AVX-NEXT: vmovdqa 48(%rdi), %xmm9
; AVX-NEXT: vpshufb %xmm2, %xmm11, %xmm0
; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15]
; AVX-NEXT: vpshufb %xmm3, %xmm4, %xmm1
; AVX-NEXT: vmovdqa %xmm4, %xmm8
; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm1
; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,2,7,12,0,0,128,128,128,2,7,12,0,0,128]
; AVX-NEXT: # xmm4 = mem[0,0]
; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm0
; AVX-NEXT: vmovdqa %xmm5, %xmm9
; AVX-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
; AVX-NEXT: vpshufb %xmm4, %xmm9, %xmm0
; AVX-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [8,13,128,128,128,0,0,3,8,13,128,128,128,0,0,3]
; AVX-NEXT: # xmm5 = mem[0,0]
; AVX-NEXT: vpshufb %xmm5, %xmm7, %xmm6
; AVX-NEXT: vmovdqa %xmm7, %xmm12
; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufb %xmm5, %xmm12, %xmm6
; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpor %xmm0, %xmm6, %xmm6
; AVX-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
; AVX-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1
; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa 176(%rdi), %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm2
; AVX-NEXT: vmovdqa %xmm1, %xmm14
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 160(%rdi), %xmm1
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3
; AVX-NEXT: vmovdqa %xmm1, %xmm13
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 176(%rdi), %xmm14
; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm2
; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 160(%rdi), %xmm13
; AVX-NEXT: vpshufb %xmm3, %xmm13, %xmm3
; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX-NEXT: vmovdqa 208(%rdi), %xmm1
; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm3
; AVX-NEXT: vmovdqa %xmm1, %xmm10
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 208(%rdi), %xmm10
; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm3
; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 192(%rdi), %xmm1
; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm4
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -5277,11 +5269,10 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vpshufb %xmm7, %xmm10, %xmm4
; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm5
; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
; AVX-NEXT: vmovdqa 144(%rdi), %xmm1
; AVX-NEXT: vmovdqa 144(%rdi), %xmm8
; AVX-NEXT: vpblendvb %xmm0, %xmm3, %xmm4, %xmm0
; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm0
; AVX-NEXT: vmovdqa %xmm1, %xmm8
; AVX-NEXT: vpshufb %xmm11, %xmm8, %xmm0
; AVX-NEXT: vmovddup {{.*#+}} xmm7 = [0,0,2,7,12,128,128,128,0,0,2,7,12,128,128,128]
; AVX-NEXT: # xmm7 = mem[0,0]
; AVX-NEXT: vmovdqa 128(%rdi), %xmm13
@@ -5631,9 +5622,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, %ymm5
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
; AVX2-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
@@ -5642,9 +5632,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, %ymm5
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
; AVX2-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
@@ -5656,9 +5645,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
; AVX2-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, %ymm5
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
; AVX2-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
@@ -5670,9 +5658,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 160(%rdi), %ymm13
; AVX2-NEXT: vmovdqa 192(%rdi), %ymm14
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
; AVX2-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0
; AVX2-NEXT: vmovdqa %ymm1, %ymm6
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
; AVX2-NEXT: vpblendvb %ymm6, %ymm13, %ymm14, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u]
; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
@@ -5897,9 +5884,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm5
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
@@ -5908,9 +5894,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm5
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
@@ -5922,9 +5907,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm5
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1]
@@ -5936,9 +5920,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm13
; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm14
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0
; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm6
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm13, %ymm14, %ymm0
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
@@ -6163,9 +6146,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
@@ -6174,9 +6156,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535]
; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
@@ -6188,9 +6169,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm5
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0]
; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm9, %ymm10, %ymm0
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1]
@@ -6202,9 +6182,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm13
; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0
; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm6
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,65535,0,0,65535,0,65535,0,0,65535,0,65535,0,0,65535]
; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm13, %ymm14, %ymm0
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u]
; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
172 changes: 73 additions & 99 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll

Large diffs are not rendered by default.

387 changes: 159 additions & 228 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll

Large diffs are not rendered by default.

476 changes: 193 additions & 283 deletions llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll

Large diffs are not rendered by default.

89 changes: 37 additions & 52 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -1164,16 +1164,15 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE: # %bb.0:
; SSE-NEXT: movdqa 16(%rdi), %xmm6
; SSE-NEXT: movdqa 32(%rdi), %xmm4
; SSE-NEXT: movdqa 48(%rdi), %xmm0
; SSE-NEXT: movdqa 48(%rdi), %xmm9
; SSE-NEXT: movdqa 16(%rsi), %xmm7
; SSE-NEXT: movdqa 32(%rsi), %xmm8
; SSE-NEXT: movdqa 48(%rsi), %xmm11
; SSE-NEXT: movdqa 32(%rdx), %xmm10
; SSE-NEXT: movdqa 48(%rdx), %xmm12
; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,2]
; SSE-NEXT: movdqa %xmm0, %xmm9
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,2,2]
; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE-NEXT: pand %xmm5, %xmm1
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[3,3,3,3,4,5,6,7]
@@ -2080,13 +2079,12 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: movdqa 32(%rdi), %xmm6
; SSE-NEXT: movdqa (%rsi), %xmm4
; SSE-NEXT: movdqa 16(%rsi), %xmm5
; SSE-NEXT: movdqa (%rdx), %xmm0
; SSE-NEXT: movdqa (%rdx), %xmm10
; SSE-NEXT: movdqa 16(%rdx), %xmm9
; SSE-NEXT: movdqa 32(%rdx), %xmm7
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; SSE-NEXT: movdqa %xmm0, %xmm10
; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0]
; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,65535,0,65535,65535]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
@@ -2165,10 +2163,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: pand %xmm11, %xmm2
; SSE-NEXT: por %xmm2, %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 48(%rdx), %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
; SSE-NEXT: movdqa %xmm2, %xmm7
; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
; SSE-NEXT: movdqa 48(%rdx), %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0]
; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa 48(%rdi), %xmm6
@@ -2195,10 +2192,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: pand %xmm11, %xmm2
; SSE-NEXT: por %xmm2, %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 64(%rdx), %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
; SSE-NEXT: movdqa %xmm2, %xmm7
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 64(%rdx), %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0]
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa 64(%rdi), %xmm6
@@ -2225,10 +2221,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: pand %xmm11, %xmm2
; SSE-NEXT: por %xmm2, %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 80(%rdx), %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
; SSE-NEXT: movdqa %xmm2, %xmm7
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 80(%rdx), %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0]
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa 80(%rdi), %xmm6
@@ -2255,10 +2250,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: pand %xmm11, %xmm2
; SSE-NEXT: por %xmm2, %xmm6
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 96(%rdx), %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
; SSE-NEXT: movdqa %xmm2, %xmm7
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 96(%rdx), %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,0,0]
; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa 96(%rdi), %xmm6
@@ -2283,10 +2277,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: pandn %xmm1, %xmm13
; SSE-NEXT: pand %xmm11, %xmm2
; SSE-NEXT: por %xmm2, %xmm13
; SSE-NEXT: movdqa 112(%rdx), %xmm2
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
; SSE-NEXT: movdqa %xmm2, %xmm6
; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 112(%rdx), %xmm6
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0]
; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pandn %xmm1, %xmm2
; SSE-NEXT: movdqa 112(%rdi), %xmm8
@@ -3097,13 +3090,11 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-LABEL: store_i16_stride3_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa %ymm1, %ymm6
; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa (%rsi), %ymm1
; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vmovdqa %ymm2, %ymm7
; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1
; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3
; AVX512-NEXT: vmovdqa (%rsi), %xmm5
@@ -3249,13 +3240,11 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-LABEL: store_i16_stride3_vf64:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm6
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm7
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5
@@ -3401,13 +3390,11 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-LABEL: store_i16_stride3_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm6
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm7
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm1
; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5
@@ -3553,13 +3540,11 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-LABEL: store_i16_stride3_vf64:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm6
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm7
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5
42 changes: 17 additions & 25 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -269,14 +269,13 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-LABEL: store_i16_stride5_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm5 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm6 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SSE-NEXT: movdqa %xmm5, %xmm6
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[3,3,3,3]
; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
@@ -2744,10 +2743,9 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535]
; AVX-NEXT: vandnps %ymm0, %ymm3, %ymm0
; AVX-NEXT: vandps %ymm3, %ymm1, %ymm1
; AVX-NEXT: vmovaps %ymm3, %ymm12
; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,0,0,65535,65535,65535,0,0,65535,65535,65535,0,0,65535,65535]
; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0
; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1
; AVX-NEXT: vorps %ymm0, %ymm1, %ymm3
; AVX-NEXT: vextractf128 $1, %ymm3, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3],xmm0[4,5,6,7]
@@ -6756,9 +6754,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512: # %bb.0:
; AVX512-NEXT: subq $488, %rsp # imm = 0x1E8
; AVX512-NEXT: vmovdqa 96(%rcx), %ymm11
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
; AVX512-NEXT: vpshufb %ymm1, %ymm11, %ymm0
; AVX512-NEXT: vmovdqa %ymm1, %ymm14
; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
; AVX512-NEXT: vpshufb %ymm14, %ymm11, %ymm0
; AVX512-NEXT: vmovdqa64 96(%rdx), %ymm17
; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,0,3,0,7,4,7,4]
; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
@@ -7145,9 +7142,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm10
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm4
; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm5
@@ -7180,9 +7176,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,2,2,2]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm7
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,0,1,8,9,8,8]
; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1
; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm2[0,1,0,1,4,5,4,5]
@@ -7389,9 +7384,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: subq $488, %rsp # imm = 0x1E8
; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm11
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm11, %ymm0
; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm14
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
; AVX512DQ-NEXT: vpshufb %ymm14, %ymm11, %ymm0
; AVX512DQ-NEXT: vmovdqa64 96(%rdx), %ymm17
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,0,3,0,7,4,7,4]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
@@ -7778,9 +7772,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm10
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,8,9,10,11,4,5,4,5,6,7,12,13]
; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm4
; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm5
@@ -7813,9 +7806,8 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,2,2,2]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm7
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,1,0,1,8,9,8,8]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm1
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm20 = zmm2[0,1,0,1,4,5,4,5]
312 changes: 131 additions & 181 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll

Large diffs are not rendered by default.

816 changes: 345 additions & 471 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll

Large diffs are not rendered by default.

141 changes: 57 additions & 84 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll

Large diffs are not rendered by default.

91 changes: 39 additions & 52 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll
@@ -1202,14 +1202,13 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: movaps 16(%rsi), %xmm13
; SSE-NEXT: movaps 32(%rsi), %xmm12
; SSE-NEXT: movaps 48(%rsi), %xmm9
; SSE-NEXT: movaps (%rdx), %xmm5
; SSE-NEXT: movaps 16(%rdx), %xmm6
; SSE-NEXT: movaps (%rdx), %xmm11
; SSE-NEXT: movaps 16(%rdx), %xmm14
; SSE-NEXT: movaps 32(%rdx), %xmm7
; SSE-NEXT: movaps 48(%rdx), %xmm8
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm5[0,3]
; SSE-NEXT: movaps %xmm5, %xmm11
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm11[0,3]
; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm1, %xmm5
; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1]
; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm0[2,0]
@@ -1222,7 +1221,7 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm6[0,3]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm14[0,3]
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
@@ -1231,8 +1230,7 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm13[1]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3],xmm13[3,3]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm6[1,1]
; SSE-NEXT: movaps %xmm6, %xmm14
; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[1,1],xmm14[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm3, %xmm0
@@ -1264,11 +1262,10 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%rdi), %xmm9
; SSE-NEXT: movaps 64(%rdx), %xmm1
; SSE-NEXT: movaps 64(%rdx), %xmm2
; SSE-NEXT: movaps %xmm9, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%rsi), %xmm12
; SSE-NEXT: movaps %xmm9, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
@@ -2139,14 +2136,13 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: movaps 16(%rsi), %xmm11
; SSE-NEXT: movaps 32(%rsi), %xmm14
; SSE-NEXT: movaps 48(%rsi), %xmm3
; SSE-NEXT: movaps (%rdx), %xmm7
; SSE-NEXT: movaps (%rdx), %xmm12
; SSE-NEXT: movaps 16(%rdx), %xmm8
; SSE-NEXT: movaps 32(%rdx), %xmm9
; SSE-NEXT: movaps 48(%rdx), %xmm10
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm7[0,3]
; SSE-NEXT: movaps %xmm7, %xmm12
; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm12[0,3]
; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm2, %xmm7
; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,0]
@@ -2201,11 +2197,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%rdi), %xmm2
; SSE-NEXT: movaps 64(%rdx), %xmm1
; SSE-NEXT: movaps 64(%rdx), %xmm4
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 64(%rsi), %xmm1
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
@@ -2219,11 +2214,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 80(%rdi), %xmm2
; SSE-NEXT: movaps 80(%rdx), %xmm1
; SSE-NEXT: movaps 80(%rdx), %xmm4
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 80(%rsi), %xmm1
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
@@ -2237,11 +2231,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 96(%rdi), %xmm2
; SSE-NEXT: movaps 96(%rdx), %xmm1
; SSE-NEXT: movaps 96(%rdx), %xmm4
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 96(%rsi), %xmm1
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
@@ -2255,11 +2248,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 112(%rdi), %xmm2
; SSE-NEXT: movaps 112(%rdx), %xmm1
; SSE-NEXT: movaps 112(%rdx), %xmm4
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 112(%rsi), %xmm1
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
@@ -2273,11 +2265,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 128(%rdi), %xmm2
; SSE-NEXT: movaps 128(%rdx), %xmm1
; SSE-NEXT: movaps 128(%rdx), %xmm4
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 128(%rsi), %xmm1
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
@@ -2291,11 +2282,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 144(%rdi), %xmm2
; SSE-NEXT: movaps 144(%rdx), %xmm1
; SSE-NEXT: movaps 144(%rdx), %xmm4
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm4[0,3]
; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 144(%rsi), %xmm1
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
@@ -2309,11 +2299,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 160(%rdi), %xmm14
; SSE-NEXT: movaps 160(%rdx), %xmm1
; SSE-NEXT: movaps 160(%rdx), %xmm3
; SSE-NEXT: movaps %xmm14, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
; SSE-NEXT: movaps %xmm1, %xmm3
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,3]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 160(%rsi), %xmm1
; SSE-NEXT: movaps %xmm14, %xmm2
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
@@ -2326,11 +2315,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 176(%rdi), %xmm12
; SSE-NEXT: movaps 176(%rdx), %xmm1
; SSE-NEXT: movaps 176(%rdx), %xmm3
; SSE-NEXT: movaps %xmm12, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
; SSE-NEXT: movaps %xmm1, %xmm3
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm3[0,3]
; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 176(%rsi), %xmm1
; SSE-NEXT: movaps %xmm12, %xmm2
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
@@ -2343,11 +2331,10 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 192(%rdi), %xmm13
; SSE-NEXT: movaps 192(%rdx), %xmm1
; SSE-NEXT: movaps 192(%rdx), %xmm2
; SSE-NEXT: movaps %xmm13, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[0,3]
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[0,3]
; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps 192(%rsi), %xmm11
; SSE-NEXT: movaps %xmm13, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
216 changes: 96 additions & 120 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll

Large diffs are not rendered by default.

113 changes: 49 additions & 64 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll

Large diffs are not rendered by default.

582 changes: 244 additions & 338 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll

Large diffs are not rendered by default.

362 changes: 146 additions & 216 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll

Large diffs are not rendered by default.

19 changes: 8 additions & 11 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
@@ -7461,10 +7461,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 320(%rdi), %ymm1
; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2]
; AVX-NEXT: vmovaps %ymm1, %ymm10
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 320(%rdi), %ymm10
; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],mem[0],ymm10[2],mem[2]
; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 320(%rcx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
@@ -7477,10 +7476,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 384(%rdi), %ymm1
; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2]
; AVX-NEXT: vmovaps %ymm1, %ymm9
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 384(%rdi), %ymm9
; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],mem[0],ymm9[2],mem[2]
; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 384(%rcx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
@@ -7500,9 +7498,8 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 480(%rdi), %ymm1
; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],mem[0],ymm1[2],mem[2]
; AVX-NEXT: vmovapd %ymm1, %ymm13
; AVX-NEXT: vmovapd 480(%rdi), %ymm13
; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],mem[0],ymm13[2],mem[2]
; AVX-NEXT: vmovapd 480(%rcx), %xmm1
; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
288 changes: 136 additions & 152 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll

Large diffs are not rendered by default.

620 changes: 268 additions & 352 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll

Large diffs are not rendered by default.

90 changes: 38 additions & 52 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -977,13 +977,12 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind {
; SSE-LABEL: store_i8_stride5_vf16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm10
; SSE-NEXT: movdqa (%rdi), %xmm12
; SSE-NEXT: movdqa (%rsi), %xmm8
; SSE-NEXT: movdqa (%rdx), %xmm2
; SSE-NEXT: movdqa (%rdx), %xmm9
; SSE-NEXT: movdqa (%rcx), %xmm4
; SSE-NEXT: movdqa (%r8), %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,3,3,3,4,5,6,7]
; SSE-NEXT: movdqa %xmm2, %xmm9
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7]
; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
; SSE-NEXT: pand %xmm6, %xmm1
@@ -996,9 +995,8 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: por %xmm1, %xmm5
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255]
; SSE-NEXT: pand %xmm2, %xmm5
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm10[1,1,2,2]
; SSE-NEXT: movdqa %xmm10, %xmm12
; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm12[1,1,2,2]
; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
; SSE-NEXT: pand %xmm1, %xmm7
; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm8[0,1,2,1]
@@ -1653,7 +1651,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-LABEL: store_i8_stride5_vf32:
; SSE: # %bb.0:
; SSE-NEXT: subq $152, %rsp
; SSE-NEXT: movdqa 16(%rdi), %xmm3
; SSE-NEXT: movdqa 16(%rdi), %xmm15
; SSE-NEXT: movdqa (%rsi), %xmm9
; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movdqa 16(%rsi), %xmm7
@@ -1677,8 +1675,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; SSE-NEXT: por %xmm0, %xmm4
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255]
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,2]
; SSE-NEXT: movdqa %xmm3, %xmm15
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,2,2]
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
; SSE-NEXT: pand %xmm3, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,1]
@@ -2004,9 +2001,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
; AVX-NEXT: vandnps %ymm0, %ymm1, %ymm2
; AVX-NEXT: vmovaps %ymm1, %ymm7
; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
; AVX-NEXT: vandnps %ymm0, %ymm7, %ymm2
; AVX-NEXT: vmovdqa 16(%rcx), %xmm0
; AVX-NEXT: vmovdqa 16(%rdx), %xmm1
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
@@ -3783,29 +3779,25 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX: # %bb.0:
; AVX-NEXT: subq $104, %rsp
; AVX-NEXT: vmovdqa 48(%rcx), %xmm0
; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128]
; AVX-NEXT: # xmm1 = mem[0,0]
; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm2
; AVX-NEXT: vmovdqa %xmm1, %xmm14
; AVX-NEXT: vmovddup {{.*#+}} xmm14 = [128,6,128,8,0,128,7,128,128,6,128,8,0,128,7,128]
; AVX-NEXT: # xmm14 = mem[0,0]
; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm2
; AVX-NEXT: vmovdqa 48(%rdx), %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9]
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm3
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,6,7,u,u,u,8,9,u,u,u,10,11,u,u,u]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
; AVX-NEXT: vandnps %ymm3, %ymm5, %ymm4
; AVX-NEXT: vmovaps %ymm5, %ymm2
; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
; AVX-NEXT: vandnps %ymm3, %ymm2, %ymm4
; AVX-NEXT: vmovdqa 48(%rsi), %xmm3
; AVX-NEXT: vmovddup {{.*#+}} xmm5 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0]
; AVX-NEXT: # xmm5 = mem[0,0]
; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX-NEXT: vmovdqa %xmm5, %xmm10
; AVX-NEXT: vmovddup {{.*#+}} xmm10 = [128,8,0,128,7,128,9,0,128,8,0,128,7,128,9,0]
; AVX-NEXT: # xmm10 = mem[0,0]
; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm6
; AVX-NEXT: vmovdqa 48(%rdi), %xmm5
; AVX-NEXT: vmovddup {{.*#+}} xmm8 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0]
; AVX-NEXT: # xmm8 = mem[0,0]
; AVX-NEXT: vpshufb %xmm8, %xmm5, %xmm7
; AVX-NEXT: vmovdqa %xmm8, %xmm12
; AVX-NEXT: vmovddup {{.*#+}} xmm12 = [8,128,0,7,128,9,128,0,8,128,0,7,128,9,128,0]
; AVX-NEXT: # xmm12 = mem[0,0]
; AVX-NEXT: vpshufb %xmm12, %xmm5, %xmm7
; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,u,u,u,9,8,u,u,u,11,10,u,u,u,13,12]
@@ -3831,10 +3823,9 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,10,11,u,u,u,12,13,u,u,u,14,15,u]
; AVX-NEXT: vmovddup {{.*#+}} xmm6 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8]
; AVX-NEXT: # xmm6 = mem[0,0]
; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm6, %xmm15
; AVX-NEXT: vmovddup {{.*#+}} xmm15 = [2,7,6,0,5,4,9,8,2,7,6,0,5,4,9,8]
; AVX-NEXT: # xmm15 = mem[0,0]
; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
; AVX-NEXT: vandnps %ymm4, %ymm11, %ymm1
@@ -3891,17 +3882,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vmovddup {{.*#+}} xmm13 = [0,1,4,5,0,2,3,6,0,1,4,5,0,2,3,6]
; AVX-NEXT: # xmm13 = mem[0,0]
; AVX-NEXT: vpshufb %xmm13, %xmm3, %xmm3
; AVX-NEXT: vmovddup {{.*#+}} xmm9 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13]
; AVX-NEXT: # xmm9 = mem[0,0]
; AVX-NEXT: vpshufb %xmm9, %xmm12, %xmm5
; AVX-NEXT: vmovddup {{.*#+}} xmm15 = [0,10,11,14,15,0,12,13,0,10,11,14,15,0,12,13]
; AVX-NEXT: # xmm15 = mem[0,0]
; AVX-NEXT: vpshufb %xmm15, %xmm12, %xmm5
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
; AVX-NEXT: vandnps %ymm2, %ymm10, %ymm2
; AVX-NEXT: vandps %ymm3, %ymm10, %ymm3
; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15]
; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX-NEXT: vmovdqa %xmm5, %xmm12
; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,128,5,6,7,8,128,10,11,12,13,128,15]
; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm3
; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,0,128,128,128,128,1,128,128,128,128,2,128]
; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm5
; AVX-NEXT: vmovdqa %xmm10, %xmm7
@@ -3937,8 +3927,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vmovdqa 16(%rsi), %xmm8
; AVX-NEXT: vmovdqa 16(%rdi), %xmm6
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15]
; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm1
; AVX-NEXT: vmovdqa %xmm9, %xmm15
; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,4,5,u,u,u,6,7,u,u,u,8,9,u,u]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vmovdqa 16(%rcx), %xmm1
@@ -4107,7 +4096,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-LABEL: store_i8_stride5_vf64:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $248, %rsp
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm13
; AVX2-NEXT: vmovdqa (%rcx), %xmm1
; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa 32(%rcx), %xmm7
@@ -4161,9 +4150,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23]
; AVX2-NEXT: vpshufb %ymm15, %ymm4, %ymm1
; AVX2-NEXT: vmovdqa %ymm4, %ymm13
; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpshufb %ymm15, %ymm13, %ymm1
; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm4, %ymm11, %ymm3
@@ -4369,7 +4357,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm1
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm2
; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm14
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0
@@ -4383,9 +4371,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vpshufb %ymm0, %ymm12, %ymm1
; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FP-NEXT: vpshufb %ymm8, %ymm2, %ymm3
; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm14
; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpshufb %ymm8, %ymm14, %ymm3
; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm11
@@ -4538,7 +4525,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: subq $168, %rsp
; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm14
; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm9
; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm15
; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm11
; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
Expand Down Expand Up @@ -4592,9 +4579,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm1
; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm2
; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm15
; AVX2-FCP-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm15, %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm15, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm3
144 changes: 60 additions & 84 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll

Large diffs are not rendered by default.

256 changes: 107 additions & 149 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll

Large diffs are not rendered by default.

102 changes: 41 additions & 61 deletions llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -3240,19 +3240,16 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7]
; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm8, %ymm8
; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm14
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm9, %ymm9
; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm13
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %ymm14, %ymm8, %ymm8
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
; AVX2-FP-NEXT: vpshufb %ymm13, %ymm9, %ymm9
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9
; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm11 = [2312,2826,3340,3854]
; AVX2-FP-NEXT: vpshufb %ymm11, %ymm9, %ymm9
; AVX2-FP-NEXT: vmovdqa %ymm11, %ymm12
; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} ymm12 = [2312,2826,3340,3854]
; AVX2-FP-NEXT: vpshufb %ymm12, %ymm9, %ymm9
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
@@ -3393,19 +3390,16 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7,8],ymm14[9],ymm13[10,11,12],ymm14[13],ymm13[14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2],ymm11[3],ymm13[4],ymm11[5],ymm13[6],ymm11[7]
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm8
; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm14
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm9
; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm13
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,8,9,8,9,8,9,8,9,12,13,10,11,0,1,2,3,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm8
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm9, %ymm9
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7],ymm9[8,9,10],ymm8[11],ymm9[12,13,14],ymm8[15]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9
; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm11 = [2312,2826,3340,3854]
; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm9
; AVX2-FCP-NEXT: vmovdqa %ymm11, %ymm12
; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} ymm12 = [2312,2826,3340,3854]
; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm9
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
@@ -3774,9 +3768,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm13
; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2
; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm3 = [2312,2826,3340,3854]
; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm14
; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} ymm14 = [2312,2826,3340,3854]
; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15]
; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm3
; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm28
@@ -4134,16 +4127,14 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm21
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm22
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3
; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854]
; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm11
; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} ymm11 = [2312,2826,3340,3854]
; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm4 = [1284,1798]
; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, %xmm8
; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} xmm8 = [1284,1798]
; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm3
; AVX512DQ-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15]
@@ -5657,11 +5648,10 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,5,7,7]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm11, %ymm8
; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
; AVX-NEXT: vbroadcastsd {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
; AVX-NEXT: vandnps %ymm9, %ymm11, %ymm9
; AVX-NEXT: vbroadcastsd {{.*#+}} ymm12 = [65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0]
; AVX-NEXT: vandnps %ymm9, %ymm12, %ymm9
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
; AVX-NEXT: vandps %ymm11, %ymm8, %ymm8
; AVX-NEXT: vmovaps %ymm11, %ymm12
; AVX-NEXT: vandps %ymm12, %ymm8, %ymm8
; AVX-NEXT: vorps %ymm9, %ymm8, %ymm8
; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm14[0,1,2,3,4,4,6,5]
; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,4,6,6,7]
@@ -6403,16 +6393,13 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7]
; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm9
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm10
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm3 = [1284,1798]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm1
; AVX2-FP-NEXT: vmovdqa %xmm3, %xmm13
; AVX2-FP-NEXT: vpmovsxwq {{.*#+}} xmm13 = [1284,1798]
; AVX2-FP-NEXT: vpshufb %xmm13, %xmm4, %xmm1
; AVX2-FP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
@@ -6727,16 +6714,13 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7,8],ymm6[9],ymm5[10,11,12],ymm6[13],ymm5[14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4],ymm3[5],ymm5[6],ymm3[7]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm9
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm10
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,0,1,0,1,8,9,10,11,4,5,2,3,0,1,4,5,0,1,4,5,8,9,10,11,4,5,6,7]
; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15]
; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm3 = [1284,1798]
; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm3, %xmm13
; AVX2-FCP-NEXT: vpmovsxwq {{.*#+}} xmm13 = [1284,1798]
; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm1
; AVX2-FCP-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15]
@@ -7530,12 +7514,10 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm3
; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm9
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm12
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm21
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
@@ -8306,12 +8288,10 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm3
; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm9
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm12
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,8,9,10,11,8,9,10,11,10,11,14,15,0,1,2,3,12,13,10,11,12,13,10,11,14,15,14,15]
; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7]
; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm21
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm2
65 changes: 28 additions & 37 deletions llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -6443,10 +6443,9 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k0, %k1, %k1
; AVX512BW-NEXT: kshiftrd $2, %k5, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k3
; AVX512BW-NEXT: kmovq %k2, %k4
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kshiftrd $2, %k5, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k3
; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kshiftrw $3, %k3, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
@@ -6665,8 +6664,8 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k4, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $21, %k1, %k2
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $21, %k7, %k2
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k2, %k3
@@ -6683,8 +6682,7 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $12, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kandw %k6, %k3, %k3
; AVX512BW-NEXT: kshiftrd $22, %k1, %k4
; AVX512BW-NEXT: kmovq %k1, %k7
; AVX512BW-NEXT: kshiftrd $22, %k7, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $11, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
@@ -7053,12 +7051,11 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $3, %k2, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $3, %k5, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k2, %k4
; AVX512BW-NEXT: kmovq %k2, %k5
; AVX512BW-NEXT: kshiftrw $2, %k5, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
@@ -7879,8 +7876,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $16, %k5, %k0
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $16, %k7, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
@@ -7904,8 +7901,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $17, %k5, %k1
; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftrq $17, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
@@ -8265,8 +8261,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z}
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $32, %k2, %k0
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $32, %k7, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
@@ -8290,8 +8286,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $33, %k2, %k1
; AVX512BW-NEXT: kmovq %k2, %k7
; AVX512BW-NEXT: kshiftrq $33, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
@@ -8651,8 +8646,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z}
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $48, %k5, %k0
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $48, %k7, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
@@ -8675,8 +8670,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrq $49, %k5, %k1
; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftrq $49, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
@@ -9905,17 +9899,16 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k0, %k6, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $4, %k5, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $4, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kmovq %k2, %k4
; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
; AVX512BW-NEXT: kshiftrw $3, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k5, %k6
; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
@@ -9990,12 +9983,11 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k6, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $23, %k2, %k1
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $23, %k6, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k3
; AVX512BW-NEXT: kshiftrd $22, %k2, %k5
; AVX512BW-NEXT: kshiftrd $22, %k6, %k5
; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovq %k2, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k2
; AVX512BW-NEXT: kshiftrw $14, %k3, %k5
@@ -10173,13 +10165,12 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $4, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k6, %k5
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $3, %k3, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $3, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
; AVX512BW-NEXT: kmovq %k3, %k7
; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
@@ -4096,10 +4096,9 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT: andb $28, %cl
; FALLBACK20-NEXT: movzbl %cl, %ecx
; FALLBACK20-NEXT: movl 32(%esp,%ecx), %esi
; FALLBACK20-NEXT: movl 36(%esp,%ecx), %ebx
; FALLBACK20-NEXT: movl %ecx, %edi
; FALLBACK20-NEXT: movzbl %cl, %edi
; FALLBACK20-NEXT: movl 32(%esp,%edi), %esi
; FALLBACK20-NEXT: movl 36(%esp,%edi), %ebx
; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT: movl %eax, %ecx
; FALLBACK20-NEXT: shrl %cl, %esi
@@ -4423,10 +4422,9 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT: andb $28, %cl
; FALLBACK24-NEXT: movzbl %cl, %ecx
; FALLBACK24-NEXT: movl 32(%esp,%ecx), %esi
; FALLBACK24-NEXT: movl 36(%esp,%ecx), %ebx
; FALLBACK24-NEXT: movl %ecx, %edi
; FALLBACK24-NEXT: movzbl %cl, %edi
; FALLBACK24-NEXT: movl 32(%esp,%edi), %esi
; FALLBACK24-NEXT: movl 36(%esp,%edi), %ebx
; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT: movl %eax, %ecx
; FALLBACK24-NEXT: shrl %cl, %esi
@@ -4745,10 +4743,9 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK28-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT: andb $28, %cl
; FALLBACK28-NEXT: movzbl %cl, %ecx
; FALLBACK28-NEXT: movl 32(%esp,%ecx), %esi
; FALLBACK28-NEXT: movl 36(%esp,%ecx), %ebx
; FALLBACK28-NEXT: movl %ecx, %edi
; FALLBACK28-NEXT: movzbl %cl, %edi
; FALLBACK28-NEXT: movl 32(%esp,%edi), %esi
; FALLBACK28-NEXT: movl 36(%esp,%edi), %ebx
; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT: movl %eax, %ecx
; FALLBACK28-NEXT: shrl %cl, %esi
@@ -6922,15 +6919,14 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; FALLBACK20-NEXT: andb $28, %cl
; FALLBACK20-NEXT: negb %cl
; FALLBACK20-NEXT: movsbl %cl, %eax
; FALLBACK20-NEXT: movl 84(%esp,%eax), %edi
; FALLBACK20-NEXT: movsbl %cl, %ebx
; FALLBACK20-NEXT: movl 84(%esp,%ebx), %edi
; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK20-NEXT: movb %dh, %cl
; FALLBACK20-NEXT: shll %cl, %edi
; FALLBACK20-NEXT: movb %dh, %dl
; FALLBACK20-NEXT: notb %dl
; FALLBACK20-NEXT: movl 80(%esp,%eax), %esi
; FALLBACK20-NEXT: movl %eax, %ebx
; FALLBACK20-NEXT: movl 80(%esp,%ebx), %esi
; FALLBACK20-NEXT: movl %esi, %eax
; FALLBACK20-NEXT: shrl %eax
; FALLBACK20-NEXT: movl %edx, %ecx
@@ -7250,15 +7246,14 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK24-NEXT: andb $28, %cl
; FALLBACK24-NEXT: negb %cl
; FALLBACK24-NEXT: movsbl %cl, %eax
; FALLBACK24-NEXT: movl 84(%esp,%eax), %edi
; FALLBACK24-NEXT: movsbl %cl, %ebx
; FALLBACK24-NEXT: movl 84(%esp,%ebx), %edi
; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK24-NEXT: movb %dh, %cl
; FALLBACK24-NEXT: shll %cl, %edi
; FALLBACK24-NEXT: movb %dh, %dl
; FALLBACK24-NEXT: notb %dl
; FALLBACK24-NEXT: movl 80(%esp,%eax), %esi
; FALLBACK24-NEXT: movl %eax, %ebx
; FALLBACK24-NEXT: movl 80(%esp,%ebx), %esi
; FALLBACK24-NEXT: movl %esi, %eax
; FALLBACK24-NEXT: shrl %eax
; FALLBACK24-NEXT: movl %edx, %ecx
@@ -7573,15 +7568,14 @@ define void @shl_32bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
; FALLBACK28-NEXT: andb $28, %cl
; FALLBACK28-NEXT: negb %cl
; FALLBACK28-NEXT: movsbl %cl, %eax
; FALLBACK28-NEXT: movl 84(%esp,%eax), %edi
; FALLBACK28-NEXT: movsbl %cl, %ebx
; FALLBACK28-NEXT: movl 84(%esp,%ebx), %edi
; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; FALLBACK28-NEXT: movb %dh, %cl
; FALLBACK28-NEXT: shll %cl, %edi
; FALLBACK28-NEXT: movb %dh, %dl
; FALLBACK28-NEXT: notb %dl
; FALLBACK28-NEXT: movl 80(%esp,%eax), %esi
; FALLBACK28-NEXT: movl %eax, %ebx
; FALLBACK28-NEXT: movl 80(%esp,%ebx), %esi
; FALLBACK28-NEXT: movl %esi, %eax
; FALLBACK28-NEXT: shrl %eax
; FALLBACK28-NEXT: movl %edx, %ecx
37 changes: 16 additions & 21 deletions llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -1581,11 +1581,10 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al
; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax,4), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%edi,4), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%edi,4), %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
@@ -2141,16 +2140,15 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: andb $28, %al
; X86-NO-BMI2-NO-SHLD-NEXT: negb %al
; X86-NO-BMI2-NO-SHLD-NEXT: movsbl %al, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebx), %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp
@@ -2346,12 +2344,11 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $28, %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: negb %cl
; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edx), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movsbl %cl, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edx), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %esi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %ecx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ecx, %edi
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx
; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
@@ -2716,10 +2713,9 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %al
; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%eax,4), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%eax,4), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %al, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esp,%ebp,4), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esp,%ebp,4), %edx
; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, %ch
@@ -4636,13 +4632,12 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebx), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl
; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebx), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp
; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %esi
; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx
; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl
@@ -3270,11 +3270,10 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
; X86-SHLD-NEXT: movl %ecx, %edi
; X86-SHLD-NEXT: andl $60, %edi
; X86-SHLD-NEXT: movl 24(%esp,%edi), %edx
; X86-SHLD-NEXT: movl 20(%esp,%edi), %eax
; X86-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SHLD-NEXT: movl 20(%esp,%edi), %esi
; X86-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SHLD-NEXT: shll $3, %ecx
; X86-SHLD-NEXT: andl $24, %ecx
; X86-SHLD-NEXT: movl %eax, %esi
; X86-SHLD-NEXT: movl %edx, %eax
; X86-SHLD-NEXT: shrdl %cl, %edx, %esi
; X86-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill