diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 2399936ffd827..24df848f87b9b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -9830,6 +9830,17 @@ static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, return (Idx % NumMemElts) == (ExpectedIdx % NumMemElts); } break; + case X86ISD::VPERMI: { + if (Op == ExpectedOp && (int)VT.getVectorNumElements() == MaskSize) { + SmallVector Mask; + DecodeVPERMMask(MaskSize, Op.getConstantOperandVal(1), Mask); + SDValue Src = Op.getOperand(0); + return (Mask[Idx] == Mask[ExpectedIdx]) || + IsElementEquivalent(MaskSize, Src, Src, Mask[Idx], + Mask[ExpectedIdx]); + } + break; + } case X86ISD::HADD: case X86ISD::HSUB: case X86ISD::FHADD: diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index 1fada58f05ba9..7d2915ddc75b1 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -1952,7 +1952,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1965,7 +1965,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512F-SLOW-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) @@ -1991,7 +1991,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) @@ -2016,7 +2016,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4209,14 +4209,13 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd %xmm0, %xmm2 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq %xmm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7] -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7] +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 7fcca526e460c..f5802150d5353 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1569,7 +1569,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; ; AVX512F-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-NEXT: vpbroadcastd (%rdi), %ymm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) @@ -1578,7 +1578,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; ; AVX512DQ-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-NEXT: vpbroadcastd (%rdi), %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) @@ -1587,7 +1587,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; ; AVX512BW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512BW-NEXT: vpbroadcastd (%rdi), %ymm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll index d9393ba9febb2..edd3933fcfc28 100644 --- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll +++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll @@ -342,7 +342,7 @@ define <4 x i64> @vec256_eltty_i64_source_subvec_1_target_subvec_mask_3_binary(< ; CHECK-LABEL: vec256_eltty_i64_source_subvec_1_target_subvec_mask_3_binary: ; CHECK: # %bb.0: ; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; CHECK-NEXT: retq %r = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> ret <4 x i64> %r @@ -597,8 +597,8 @@ define <8 x i32> @vec256_eltty_i32_source_subvec_1_target_subvec_mask_3_unary(<8 define <8 x i32> @vec256_eltty_i32_source_subvec_1_target_subvec_mask_3_binary(<8 x i32> %x, <8 x i32> %y) nounwind { ; CHECK-SLOW-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_3_binary: ; CHECK-SLOW: # %bb.0: -; CHECK-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,2,3] -; CHECK-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; CHECK-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 +; CHECK-SLOW-NEXT: vbroadcastss %xmm1, %ymm1 ; CHECK-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; CHECK-SLOW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index 7fbb211b69ccf..bc83cc1cab42d 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -1952,7 +1952,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1965,7 +1965,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512F-SLOW-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx) @@ -1991,7 +1991,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX512DQ-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512DQ-SLOW-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx) @@ -2016,7 +2016,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 +; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4649,11 +4649,10 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -4669,7 +4668,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] @@ -4687,7 +4686,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpbroadcastq %xmm0, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7] @@ -6582,7 +6581,7 @@ define void @vec512_i32_widen_to_i64_factor2_broadcast_to_v8i64_factor8(ptr %in. ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index 45d589b6c988e..266b06a23df94 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1569,7 +1569,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; ; AVX512F-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512F-NEXT: vpbroadcastd (%rdi), %ymm0 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7] ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx) @@ -1578,7 +1578,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; ; AVX512DQ-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512DQ-NEXT: vpbroadcastd (%rdi), %ymm0 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7] ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) @@ -1587,7 +1587,7 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in. ; ; AVX512BW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq (%rdi), %ymm0 +; AVX512BW-NEXT: vpbroadcastd (%rdi), %ymm0 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7] ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)