diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c1588b0b7b5df..4207d23655390 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9978,12 +9978,17 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
   assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
          "Illegal variable permute mask size");
   if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
-    if (IndicesVec.getValueSizeInBits() == SizeInBits)
-      IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
-                               IndicesVT, IndicesVec);
-    else
+    // Narrow/widen the indices vector to the correct size.
+    if (IndicesVec.getValueSizeInBits() > SizeInBits)
       IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
                                     NumElts * VT.getScalarSizeInBits());
+    else if (IndicesVec.getValueSizeInBits() < SizeInBits)
+      IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
+                                  SDLoc(IndicesVec), SizeInBits);
+    // Zero-extend the index elements within the vector.
+    if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
+      IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
+                               IndicesVT, IndicesVec);
   }
 
   IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index 9902a351b8bf4..3327704c04a08 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -1138,6 +1138,10 @@ entry:
   ret <4 x i32> %tmp12
 }
 
+;
+; PR50356 - correctly adjust the indices vector to match the source/destination size.
+;
+
 define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr nounwind {
 ; XOP-LABEL: PR50356:
 ; XOP:       # %bb.0:
@@ -1255,3 +1259,76 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr
   %v37 = select <4 x i1> %v36, <4 x i64> , <4 x i64>  ; 17 68 102 136
   ret <4 x i64> %v37
 }
+
+define <4 x i64> @var_shuffle_v4i64_with_v16i8_indices(<4 x i64> %v, <16 x i8> %indices) unnamed_addr nounwind {
+; XOP-LABEL: var_shuffle_v4i64_with_v16i8_indices:
+; XOP:       # %bb.0:
+; XOP-NEXT:    vpsrld $16, %xmm1, %xmm2
+; XOP-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; XOP-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; XOP-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; XOP-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; XOP-NEXT:    vpaddq %xmm2, %xmm2, %xmm2
+; XOP-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; XOP-NEXT:    vpermil2pd $0, %ymm1, %ymm3, %ymm0, %ymm0
+; XOP-NEXT:    retq
+;
+; AVX1-LABEL: var_shuffle_v4i64_with_v16i8_indices:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm2
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddq %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
+; AVX1-NEXT:    vpermilpd %ymm4, %ymm3, %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT:    vpermilpd %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vpcmpgtq {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT:    vpcmpgtq {{\.LCPI[0-9]+_[0-9]+}}+{{.*}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT:    vblendvpd %ymm1, %ymm3, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: var_shuffle_v4i64_with_v16i8_indices:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2]
+; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX2-NEXT:    vpermilpd %ymm1, %ymm3, %ymm3
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: var_shuffle_v4i64_with_v16i8_indices:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: var_shuffle_v4i64_with_v16i8_indices:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
+; AVX512VL-NEXT:    vpermq %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+  %index0 = extractelement <16 x i8> %indices, i32 0
+  %index1 = extractelement <16 x i8> %indices, i32 1
+  %index2 = extractelement <16 x i8> %indices, i32 2
+  %index3 = extractelement <16 x i8> %indices, i32 3
+  %v0 = extractelement <4 x i64> %v, i8 %index0
+  %v1 = extractelement <4 x i64> %v, i8 %index1
+  %v2 = extractelement <4 x i64> %v, i8 %index2
+  %v3 = extractelement <4 x i64> %v, i8 %index3
+  %ret0 = insertelement <4 x i64> undef, i64 %v0, i32 0
+  %ret1 = insertelement <4 x i64> %ret0, i64 %v1, i32 1
+  %ret2 = insertelement <4 x i64> %ret1, i64 %v2, i32 2
+  %ret3 = insertelement <4 x i64> %ret2, i64 %v3, i32 3
+  ret <4 x i64> %ret3
+}
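
Note on the code change: the fix splits the index-vector adjustment into two independent steps. First the total bit width is matched to the destination (extract the low subvector if the indices are wider, widen with an undef tail if narrower), and only then are the low NumElts elements zero-extended to the destination's scalar width. Previously, any index vector whose width differed from SizeInBits fell into the extractSubVector path, so indices narrower than the destination (e.g. the v16i8 indices of the new var_shuffle_v4i64_with_v16i8_indices test) requested a subvector larger than its source. The following is a minimal standalone sketch of the adjustment order in plain C++; Vec and adjustIndices are hypothetical stand-ins for the SelectionDAG values and helpers named in the diff, not LLVM API.

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for an index vector: element bit width plus values.
struct Vec {
  unsigned EltBits;
  std::vector<uint64_t> Elts;
  size_t sizeInBits() const { return EltBits * Elts.size(); }
};

// Mirrors the patched adjustment order in createVariablePermute(): fix the
// total bit width first, then the element count, then the element type.
// NumElts/DstEltBits describe the destination permute type.
static Vec adjustIndices(Vec Idx, size_t NumElts, unsigned DstEltBits) {
  assert(Idx.Elts.size() >= NumElts && "Illegal variable permute mask size");
  const size_t SizeInBits = NumElts * DstEltBits;
  if (Idx.Elts.size() > NumElts) {
    // Narrow/widen the indices vector to the correct size.
    if (Idx.sizeInBits() > SizeInBits)
      Idx.Elts.resize(SizeInBits / Idx.EltBits);    // extractSubVector
    else if (Idx.sizeInBits() < SizeInBits)
      Idx.Elts.resize(SizeInBits / Idx.EltBits, 0); // widenSubVector
                                                    // (undef tail in the DAG)
    // Zero-extend the index elements within the vector:
    // ISD::ZERO_EXTEND_VECTOR_INREG keeps the low NumElts elements.
    if (Idx.Elts.size() > NumElts) {
      Idx.Elts.resize(NumElts);
      Idx.EltBits = unsigned(SizeInBits / NumElts);
    }
  }
  // Final zext/trunc of the element type (DAG.getZExtOrTrunc).
  if (DstEltBits < Idx.EltBits && DstEltBits < 64)
    for (uint64_t &E : Idx.Elts)
      E &= (uint64_t(1) << DstEltBits) - 1;
  Idx.EltBits = DstEltBits;
  return Idx;
}

int main() {
  // v16i8 indices driving a v4i64 permute: 128 bits of indices vs. a 256-bit
  // destination. The vector is widened to 256 bits first, then the low four
  // bytes are zero-extended into i64 lanes.
  Vec Idx{8, {3, 0, 2, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9}};
  Vec Adjusted = adjustIndices(Idx, /*NumElts=*/4, /*DstEltBits=*/64);
  for (uint64_t E : Adjusted.Elts)
    std::printf("%llu ", (unsigned long long)E); // prints: 3 0 2 1
  std::printf("\n");
  return 0;
}

Under the old ordering, the sketch's v16i8 input (128 bits, 16 elements) would have taken the extract path because its width differed from SizeInBits, extracting 256 bits from a 128-bit value; widening first is what makes the subsequent in-register zero-extend legal.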