diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bea4d58e758ec6..d5d41ca50553db 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12406,22 +12406,33 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
   unsigned EltSizeInBits = VT.getScalarSizeInBits();
   unsigned MaxScale = 64 / EltSizeInBits;
   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+    unsigned SrcEltBits = EltSizeInBits * Scale;
     unsigned NumSrcElts = NumElts / Scale;
     unsigned UpperElts = NumElts - NumSrcElts;
     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
       continue;
 
+    // Attempt to find a matching source truncation, but as a fall back VLX
+    // cases can use the VPMOV directly.
     SDValue Src = peekThroughBitcasts(V1);
-    if (Src.getOpcode() != ISD::TRUNCATE ||
-        Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
+    if (Src.getOpcode() == ISD::TRUNCATE &&
+        Src.getScalarValueSizeInBits() == SrcEltBits) {
+      Src = Src.getOperand(0);
+    } else if (Subtarget.hasVLX()) {
+      MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+      MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
+      Src = DAG.getBitcast(SrcVT, Src);
+      // Don't do this if PACKSS/PACKUS could perform it cheaper.
+      if (Scale == 2 &&
+          ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
+           (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
+        return SDValue();
+    } else
       return SDValue();
-    Src = Src.getOperand(0);
 
     // VPMOVWB is only available with avx512bw.
-    MVT SrcVT = Src.getSimpleValueType();
-    if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
-        !Subtarget.hasBWI())
+    if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
       return SDValue();
 
     bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll
index 096ce90742a67a..c41c8ca835cf7e 100644
--- a/llvm/test/CodeGen/X86/avx512-trunc.ll
+++ b/llvm/test/CodeGen/X86/avx512-trunc.ll
@@ -187,7 +187,7 @@ define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 {
 ;
 ; SKX-LABEL: trunc_qw_128:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; SKX-NEXT:    vpmovqw %xmm0, %xmm0
 ; SKX-NEXT:    retq
   %x = trunc <2 x i64> %i to <2 x i16>
   ret <2 x i16> %x
diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll
index 60364e638d8a8c..54e62066d2eeae 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll
@@ -739,8 +739,7 @@ define <2 x half> @test_s8tofp2(<2 x i8> %arg0) {
 define <2 x half> @test_u1tofp2(<2 x i1> %arg0) {
 ; CHECK-LABEL: test_u1tofp2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-NEXT:    vpmovqw %xmm0, %xmm0
 ; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT:    vcvtuw2ph %xmm0, %xmm0
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
index 3ac5ce820053bf..18dbb3ffec159a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll
@@ -73,10 +73,9 @@ define void @vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
 ; AVX512-LABEL: vf4:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vpmovdw %xmm0, %xmm1
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT:    vmovq %xmm1, (%rsi)
-; AVX512-NEXT:    vmovq %xmm0, (%rdx)
+; AVX512-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT:    vpmovdw %xmm0, (%rsi)
+; AVX512-NEXT:    vmovq %xmm1, (%rdx)
 ; AVX512-NEXT:    retq
   %wide.vec = load <8 x i16>, ptr %in.vec, align 32
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index e16668a96bd171..d06ab7caec3b66 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -42,16 +42,15 @@ define void @vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
 ; AVX512-LABEL: vf2:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vpmovqw %xmm0, %xmm1
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX512-NEXT:    vmovd %xmm1, (%rsi)
-; AVX512-NEXT:    vmovd %xmm2, (%rdx)
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512-NEXT:    vpmovqw %xmm0, (%rsi)
+; AVX512-NEXT:    vmovd %xmm1, (%rdx)
 ; AVX512-NEXT:    vmovd %xmm3, (%rcx)
-; AVX512-NEXT:    vmovd %xmm0, (%r8)
+; AVX512-NEXT:    vmovd %xmm2, (%r8)
 ; AVX512-NEXT:    retq
   %wide.vec = load <8 x i16>, ptr %in.vec, align 32
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index 8ed4d83fcf8d81..0cd027aa051547 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -1935,13 +1935,21 @@ define <4 x i16> @rot16_trunc(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: rot16_trunc:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsrld $11, %xmm0, %xmm1
-; AVX512-NEXT:    vpslld $5, %xmm0, %xmm0
-; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT:    retq
+; AVX512NOVLX-LABEL: rot16_trunc:
+; AVX512NOVLX:       # %bb.0:
+; AVX512NOVLX-NEXT:    vpsrld $11, %xmm0, %xmm1
+; AVX512NOVLX-NEXT:    vpslld $5, %xmm0, %xmm0
+; AVX512NOVLX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512NOVLX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512NOVLX-NEXT:    retq
+;
+; AVX512VLX-LABEL: rot16_trunc:
+; AVX512VLX:       # %bb.0:
+; AVX512VLX-NEXT:    vpsrld $11, %xmm0, %xmm1
+; AVX512VLX-NEXT:    vpslld $5, %xmm0, %xmm0
+; AVX512VLX-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512VLX-NEXT:    vpmovdw %xmm0, %xmm0
+; AVX512VLX-NEXT:    retq
 ;
 ; XOP-LABEL: rot16_trunc:
 ; XOP:       # %bb.0:
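
For reference, a minimal LLVM IR sketch of the pattern this patch improves, mirroring the trunc_qw_128 test above. This is not part of the patch; the function name and the exact llc flags are illustrative assumptions, hedged accordingly.

; Illustrative sketch only. With an AVX512VL-capable target (e.g. something like
; llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl; flags are an
; assumption, see the RUN lines in avx512-trunc.ll for the actual configuration),
; this narrow truncation is now selected directly to vpmovqw instead of a
; vpshufb shuffle, as the avx512-trunc.ll hunk above shows.
define <2 x i16> @trunc_qw_example(<2 x i64> %i) {
  %x = trunc <2 x i64> %i to <2 x i16>
  ret <2 x i16> %x
}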