diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 0af3cacb22813..ecf151ffeb664 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -14031,6 +14031,12 @@ static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, NewMask[2] = Mask[2] < 4 ? 1 : 3; NewMask[3] = Mask[2] < 4 ? 3 : 1; } + } else if (NumV2Elements == 3) { + // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but + // we can get here due to other paths (e.g. repeated mask matching) where we + // don't want to do another round of lowerVECTOR_SHUFFLE. + ShuffleVectorSDNode::commuteMask(NewMask); + return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG); } return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll index ccf1476e6a657..422f64d982bfb 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -596,6 +596,21 @@ define void @test_demandedelts_pshufb_v32i8_v16i8(<2 x i32>* %src, <8 x i32>* %d ret void } +define <32 x float> @PR47534(<8 x float> %tmp) { +; CHECK-LABEL: PR47534: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [7,25,26,27,7,29,30,31,7,25,26,27,7,29,30,31] +; CHECK-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] +; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 +; CHECK-NEXT: ret{{[l|q]}} + %tmp1 = shufflevector <8 x float> %tmp, <8 x float> undef, <32 x i32> + %tmp2 = shufflevector <32 x float> , <32 x float> undef, <32 x i32> + %tmp18 = shufflevector <32 x float> %tmp2, <32 x float> %tmp1, <32 x i32> + ret <32 x float> %tmp18 +} + %union1= type { <16 x float> } @src1 = external dso_local local_unnamed_addr global %union1, align 64