diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 895a02e5c98e20..a293c48a824ae4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -36610,6 +36610,17 @@ static SDValue combineX86ShufflesRecursively(
     }
   }
 
+  // Attempt to constant fold all of the constant source ops.
+  if (SDValue Cst = combineX86ShufflesConstants(
+          Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
+    return Cst;
+
+  // Canonicalize the combined shuffle mask chain with horizontal ops.
+  // NOTE: This will update the Ops and Mask.
+  if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
+          Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
+    return DAG.getBitcast(Root.getValueType(), HOp);
+
   // Widen any subvector shuffle inputs we've collected.
   if (any_of(Ops, [RootSizeInBits](SDValue Op) {
         return Op.getValueSizeInBits() < RootSizeInBits;
@@ -36622,17 +36633,6 @@ static SDValue combineX86ShufflesRecursively(
     resolveTargetShuffleInputsAndMask(Ops, Mask);
   }
 
-  // Attempt to constant fold all of the constant source ops.
-  if (SDValue Cst = combineX86ShufflesConstants(
-          Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
-    return Cst;
-
-  // Canonicalize the combined shuffle mask chain with horizontal ops.
-  // NOTE: This will update the Ops and Mask.
-  if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
-          Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
-    return DAG.getBitcast(Root.getValueType(), HOp);
-
   // We can only combine unary and binary shuffle mask cases.
   if (Ops.size() <= 2) {
     // Minor canonicalization of the accumulated shuffle mask to make it easier
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index 2c53579f762725..c358250305a7ca 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -108,13 +108,12 @@ define void @PR46178(i16* %0) {
 ; X86-NEXT:    vmovdqu (%eax), %ymm1
 ; X86-NEXT:    vpmovqw %ymm0, %xmm0
 ; X86-NEXT:    vpmovqw %ymm1, %xmm1
-; X86-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; X86-NEXT:    vpsllw $8, %ymm0, %ymm0
-; X86-NEXT:    vpsraw $8, %ymm0, %ymm0
-; X86-NEXT:    vmovapd {{.*#+}} ymm1 = [0,0,2,0,4,0,4,0]
-; X86-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; X86-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1
-; X86-NEXT:    vmovupd %ymm1, (%eax)
+; X86-NEXT:    vpsllw $8, %xmm1, %xmm1
+; X86-NEXT:    vpsraw $8, %xmm1, %xmm1
+; X86-NEXT:    vpsllw $8, %xmm0, %xmm0
+; X86-NEXT:    vpsraw $8, %xmm0, %xmm0
+; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[3]
+; X86-NEXT:    vmovupd %ymm0, (%eax)
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
 ;