45 changes: 38 additions & 7 deletions llvm/test/CodeGen/X86/widen_fsub.ll
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,44 @@ define void @widen_fsub_v2f32_v8f32(ptr %a0, ptr %b0, ptr %c0) {
; SSE-NEXT: movlps %xmm3, 24(%rdx)
; SSE-NEXT: retq
;
; AVX-LABEL: widen_fsub_v2f32_v8f32:
; AVX: # %bb.0:
; AVX-NEXT: vmovups (%rdi), %ymm0
; AVX-NEXT: vsubps (%rsi), %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, (%rdx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX1OR2-LABEL: widen_fsub_v2f32_v8f32:
; AVX1OR2: # %bb.0:
; AVX1OR2-NEXT: vmovups (%rdi), %ymm0
; AVX1OR2-NEXT: vsubps (%rsi), %ymm0, %ymm0
; AVX1OR2-NEXT: vmovups %ymm0, (%rdx)
; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
;
; AVX512F-LABEL: widen_fsub_v2f32_v8f32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovups (%rdi), %ymm0
; AVX512F-NEXT: vsubps (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vmovups %ymm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: widen_fsub_v2f32_v8f32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX512VL-NEXT: vsubps %xmm5, %xmm1, %xmm1
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX512VL-NEXT: vsubps %xmm6, %xmm3, %xmm3
; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX512VL-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3
; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX512VL-NEXT: vsubps %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX512VL-NEXT: vmovups %ymm0, (%rdx)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%a2 = getelementptr inbounds i8, ptr %a0, i64 8
%b2 = getelementptr inbounds i8, ptr %b0, i64 8
%c2 = getelementptr inbounds i8, ptr %c0, i64 8
Expand Down
28 changes: 14 additions & 14 deletions llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6123,13 +6123,13 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512F-FAST: # %bb.0:
; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,u,9,u,0,u,1,u>
; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm2, %zmm0
; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
Expand Down Expand Up @@ -6159,13 +6159,13 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,u,9,u,0,u,1,u>
; AVX512BW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FAST-NEXT: vpandq %zmm0, %zmm2, %zmm0
; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u]
; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7]
; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
Expand Down
112 changes: 54 additions & 58 deletions llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2031,9 +2031,9 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.
; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15]
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm1, %zmm0
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
Expand Down Expand Up @@ -2575,10 +2575,10 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[0,1]
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm3
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm2
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero
Expand Down Expand Up @@ -2870,10 +2870,10 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[0,1]
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm3
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm2
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero
Expand Down Expand Up @@ -3164,10 +3164,10 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[0,1]
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm3
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
; AVX2-NEXT: vpbroadcastb %xmm1, %xmm2
; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
Expand Down Expand Up @@ -3774,18 +3774,17 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %xmm2
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512F-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7],ymm3[8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14],ymm0[15]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
Expand Down Expand Up @@ -3814,18 +3813,17 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %xmm2
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512DQ-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7],ymm3[8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14],ymm0[15]
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13,14],ymm0[15]
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
; AVX512DQ-SLOW-NEXT: vzeroupper
; AVX512DQ-SLOW-NEXT: retq
;
Expand Down Expand Up @@ -4209,18 +4207,17 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %xmm2
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512F-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4],ymm3[5,6,7,8,9,10,11],ymm0[12],ymm3[13,14,15]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512F-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
Expand Down Expand Up @@ -4249,18 +4246,17 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %xmm2
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7]
; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX512DQ-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4],ymm3[5,6,7,8,9,10,11],ymm0[12],ymm3[13,14,15]
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4,5,6,7]
; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx)
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7]
; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7,8,9,10,11],ymm0[12],ymm2[13,14,15]
; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
; AVX512DQ-SLOW-NEXT: vzeroupper
; AVX512DQ-SLOW-NEXT: retq
;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2038,7 +2038,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm2[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0]
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero,xmm1[0],zero,zero
Expand Down Expand Up @@ -2290,7 +2290,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm2[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero
Expand Down Expand Up @@ -2542,7 +2542,7 @@ define void @vec384_i8_widen_to_i96_factor12_broadcast_to_v4i96_factor4(ptr %in.
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm2[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm0[2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255]
; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
Expand Down