diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 144c81b3ebebf7..b435f13632c79e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35797,6 +35797,19 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
       (RootVT.isFloatingPoint() && Depth >= 1) ||
       (RootVT.is256BitVector() && !Subtarget.hasAVX2());
 
+  // How many elements does each of the inputs have, given the current
+  // granularity of the root shuffle? Note that while currently the sizes of
+  // the inputs must match the size of the shuffle root,
+  // that restriction will be lifted in the future.
+  SmallVector<unsigned, 2> InputNumElts;
+  llvm::transform(std::initializer_list<MVT>({VT1, VT2}),
+                  std::back_inserter(InputNumElts),
+                  [BaseMaskEltSizeInBits](MVT VT) {
+                    assert(VT.getSizeInBits() % BaseMaskEltSizeInBits == 0 &&
+                           "Input is not a multiple of output element width?");
+                    return VT.getSizeInBits() / BaseMaskEltSizeInBits;
+                  });
+
   // Don't combine if we are a AVX512/EVEX target and the mask element size
   // is different from the root element size - this would prevent writemasks
   // from being reused.
@@ -35811,12 +35824,38 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   // If we are shuffling a broadcast (and not introducing zeros) then
   // we can just use the broadcast directly. This works for smaller broadcast
   // elements as well as they already repeat across each mask element
-  if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
-      (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+  SmallVector<bool, 2> InputIsSplat;
+  llvm::transform(
+      std::initializer_list<SDValue>({V1, V2}),
+      std::back_inserter(InputIsSplat), [BaseMaskEltSizeInBits](SDValue V) {
+        return isTargetShuffleSplat(V) &&
+               (BaseMaskEltSizeInBits % V.getScalarValueSizeInBits()) == 0;
+      });
+  if (UnaryShuffle && InputIsSplat[0] && !isAnyZero(BaseMask) &&
       V1.getValueSizeInBits() >= RootSizeInBits) {
     return CanonicalizeShuffleInput(RootVT, V1);
   }
 
+  // Adjust mask elements that pick from a splat input to be identity mask
+  // elements, i.e. to pick from the same lane of the input as the lane the
+  // mask element is in. This may allow the shuffle to simplify into a blend.
+  SmallVector<int, 64> NewMask;
+  if (InputIsSplat[0] || InputIsSplat[1]) {
+    NewMask.assign(BaseMask.begin(), BaseMask.end());
+    for (unsigned i = 0; i != NumBaseMaskElts; ++i) {
+      int &M = NewMask[i];
+      assert(isUndefOrZeroOrInRange(M, 0, 2 * NumBaseMaskElts) &&
+             "OOB mask element?");
+      if (M < 0)
+        continue; // Keep the undef/zero mask elements as-is.
+      int InputIdx = (unsigned)M < NumBaseMaskElts ? 0 : 1;
+      // Is the used input wide enough to contain that lane, and is it a splat?
+      if (InputIsSplat[InputIdx] && i < InputNumElts[InputIdx])
+        M = i + InputIdx * NumBaseMaskElts; // Pick from the same lane of input.
+    }
+    BaseMask = std::move(NewMask);
+  }
+
   // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
   // etc. can be simplified.
   if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
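The hunks above boil down to one idea: if a shuffle operand is a splat (broadcast), every lane of it holds the same value, so any mask element that reads from it can just as well read from its own lane number. Below is a minimal standalone sketch of that canonicalization using plain integer masks rather than the DAG types; the helper name canonicalizeSplatMask and its parameters are illustrative only and are not part of the patch, which performs the equivalent rewrite on BaseMask inside combineX86ShuffleChain.

// Toy model: mask elements 0..N-1 pick from input 0, N..2N-1 pick from
// input 1, and -1 means undef/zero. Splat-reads become identity-lane reads.
#include <cstdio>
#include <vector>

static void canonicalizeSplatMask(std::vector<int> &Mask,
                                  const bool InputIsSplat[2],
                                  const unsigned InputNumElts[2]) {
  const unsigned NumElts = (unsigned)Mask.size();
  for (unsigned I = 0; I != NumElts; ++I) {
    int &M = Mask[I];
    if (M < 0)
      continue; // Leave undef/zero elements untouched.
    const unsigned InputIdx = (unsigned)M < NumElts ? 0 : 1;
    // Only rewrite if that input is a splat and actually contains lane I.
    if (InputIsSplat[InputIdx] && I < InputNumElts[InputIdx])
      M = I + InputIdx * NumElts; // Identity lane within the chosen input.
  }
}

int main() {
  // <4 x float> shuffle mask <0,1,2,4>: lane 3 reads element 0 of input 1.
  // If input 1 is a broadcast, element 0 equals element 3, so the mask can
  // become <0,1,2,7>, which is exactly a per-lane blend of lane 3.
  std::vector<int> Mask = {0, 1, 2, 4};
  const bool InputIsSplat[2] = {false, true};
  const unsigned InputNumElts[2] = {4, 4};
  canonicalizeSplatMask(Mask, InputIsSplat, InputNumElts);
  for (int M : Mask)
    std::printf("%d ", M); // Prints: 0 1 2 7
  std::printf("\n");
}

The regenerated tests below show the effect: inserting element 0 of a broadcast (vinsertps $48, ... xmm4[0]) becomes a blend of its element 3 (vblendps $8, ... xmm4[3]), a vpunpcklqdq with a broadcast becomes a vpblendd, and in pr15296.ll the per-element variable-shift expansion collapses to a single vpsrld per 128-bit half.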
diff --git a/llvm/test/CodeGen/X86/avx.ll b/llvm/test/CodeGen/X86/avx.ll
index a176edba13aa96..b542f173ae9826 100644
--- a/llvm/test/CodeGen/X86/avx.ll
+++ b/llvm/test/CodeGen/X86/avx.ll
@@ -153,11 +153,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
-; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X32-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X32-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X32-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X32-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
+; X32-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
 ; X32-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    retl
@@ -165,11 +165,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X64-LABEL: insertps_from_broadcast_multiple_use:
 ; X64:       ## %bb.0:
 ; X64-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4
-; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
-; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
+; X64-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
 ; X64-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 71682094d64e9f..a763f92b764094 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -4315,7 +4315,7 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
 ; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,4,2,6]
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [3,5,2,7]
 ; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm3
 ; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
@@ -4340,7 +4340,7 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x doub
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1:
 ; CHECK-FAST:       # %bb.0:
 ; CHECK-FAST-NEXT:    vmovapd (%rdi), %ymm2
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,4,2,6]
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm1 = [3,5,2,7]
 ; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm0, %k1
 ; CHECK-FAST-NEXT:    vpermi2pd 32(%rdi){1to4}, %ymm2, %ymm1 {%k1} {z}
diff --git a/llvm/test/CodeGen/X86/pr15296.ll b/llvm/test/CodeGen/X86/pr15296.ll
index 71034f696429c9..f9575571300087 100644
--- a/llvm/test/CodeGen/X86/pr15296.ll
+++ b/llvm/test/CodeGen/X86/pr15296.ll
@@ -26,28 +26,11 @@ allocas:
 define <8 x i32> @shiftInput___canonical(<8 x i32> %input, i32 %shiftval, <8 x i32> %__mask) nounwind {
 ; CHECK-LABEL: shiftInput___canonical:
 ; CHECK:       # %bb.0: # %allocas
-; CHECK-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
-; CHECK-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; CHECK-NEXT:    vpsrld %xmm2, %xmm3, %xmm4
-; CHECK-NEXT:    vpsrlq $32, %xmm1, %xmm5
-; CHECK-NEXT:    vpsrld %xmm5, %xmm3, %xmm6
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
-; CHECK-NEXT:    vpxor %xmm6, %xmm6, %xmm6
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm6 = xmm1[0,1],xmm6[2,3,4,5,6,7]
-; CHECK-NEXT:    vpsrld %xmm6, %xmm3, %xmm7
-; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; CHECK-NEXT:    vpsrld %xmm1, %xmm3, %xmm3
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4,5,6,7]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; CHECK-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
-; CHECK-NEXT:    vpsrld %xmm5, %xmm0, %xmm4
-; CHECK-NEXT:    vpsrld %xmm6, %xmm0, %xmm5
-; CHECK-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm2[4,5,6,7]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; CHECK-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vpsrld %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpsrld %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retl
 allocas:
   %smear.0 = insertelement <8 x i32> undef, i32 %shiftval, i32 0
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index 17aae3373c5ec5..1a1b976c45403a 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -1661,15 +1661,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-AVX1-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X86-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X86-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
 ; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X86-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
+; X86-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
+; X86-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
+; X86-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
 ; X86-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
 ; X86-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
 ; X86-AVX1-NEXT:    retl ## encoding: [0xc3]
@@ -1679,16 +1679,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
 ; X86-AVX512-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0x81]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X86-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
+; X86-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
+; X86-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
+; X86-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
 ; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X86-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X86-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X86-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
-; X86-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
+; X86-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
 ; X86-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -1712,15 +1712,15 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X64-AVX1-LABEL: insertps_from_broadcast_multiple_use:
 ; X64-AVX1:       ## %bb.0:
 ; X64-AVX1-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X64-AVX1-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X64-AVX1-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
 ; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-AVX1-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
+; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm1 ## encoding: [0xc4,0xe3,0x69,0x0c,0xcc,0x08]
+; X64-AVX1-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[3]
+; X64-AVX1-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm2 ## encoding: [0xc4,0xe3,0x61,0x0c,0xd4,0x08]
+; X64-AVX1-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[3]
 ; X64-AVX1-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x58,0xca]
 ; X64-AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x58,0xc1]
 ; X64-AVX1-NEXT:    retq ## encoding: [0xc3]
@@ -1728,16 +1728,16 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X64-AVX512-LABEL: insertps_from_broadcast_multiple_use:
 ; X64-AVX512:       ## %bb.0:
 ; X64-AVX512-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x24,0xb7]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc4,0x30]
-; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[0]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xcc,0x30]
-; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[0]
+; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc4,0x08]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0,1,2],xmm4[3]
+; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x0c,0xcc,0x08]
+; X64-AVX512-NEXT:    ## xmm1 = xmm1[0,1,2],xmm4[3]
+; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x0c,0xd4,0x08]
+; X64-AVX512-NEXT:    ## xmm2 = xmm2[0,1,2],xmm4[3]
+; X64-AVX512-NEXT:    vblendps $8, %xmm4, %xmm3, %xmm3 ## encoding: [0xc4,0xe3,0x61,0x0c,0xdc,0x08]
+; X64-AVX512-NEXT:    ## xmm3 = xmm3[0,1,2],xmm4[3]
 ; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xcc,0x30]
-; X64-AVX512-NEXT:    ## xmm1 = xmm2[0,1,2],xmm4[0]
-; X64-AVX512-NEXT:    vinsertps $48, %xmm4, %xmm3, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd4,0x30]
-; X64-AVX512-NEXT:    ## xmm2 = xmm3[0,1,2],xmm4[0]
-; X64-AVX512-NEXT:    vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca]
+; X64-AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xcb]
 ; X64-AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
   %1 = getelementptr inbounds float, float* %fb, i64 %index
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index d280580f55f1b8..96bcaa1a0d443a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -4591,14 +4591,14 @@ define <32 x i8> @shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VLBW-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
 ; AVX512VLBW:       # %bb.0:
 ; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %xmm1
 ; AVX512VLBW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,15,15,15,15,15,15,15,u,u,u,u,u,u,u,u]
-; AVX512VLBW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VLBW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512VLBW-NEXT:    retq
 ;
 ; AVX512VLVBMI-LABEL: shuffle_v32i8_15_15_15_15_15_15_15_15_32_32_32_32_32_32_32_32_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu: