diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d49f25a950e3a..8ff476b87dc5e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52829,6 +52829,91 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
   return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
 }
 
+// Check whether this is a shuffle that interleaves the lanes of the two input
+// vectors. e.g. when interleaving two v8i32 into a single v16i32 the mask is
+// <0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23>. Indices are based
+// on the target type.
+static bool isLaneInterleaveMask(ArrayRef<int> Mask, MVT VT) {
+  assert(VT.isVector() && "Expected vector VT.");
+
+  MVT ElemVT = VT.getScalarType();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned EltBits = ElemVT.getSizeInBits();
+
+  if (Mask.size() != NumElts)
+    return false;
+
+  // A lane is 128 bits.
+  if (EltBits == 0 || (128u % EltBits) != 0)
+    return false;
+
+  // So 4 for i32, 8 for i16, etc.
+  unsigned EltsPerLane = 128u / EltBits;
+  unsigned GroupSize = 2 * EltsPerLane;
+
+  if (NumElts % GroupSize != 0)
+    return false;
+
+  unsigned Pos = 0;
+  for (unsigned G = 0; G != (NumElts / GroupSize); ++G) {
+    // Indices are based on the output type, hence B starts at NumElts.
+    unsigned ABase = G * EltsPerLane;
+    unsigned BBase = NumElts + G * EltsPerLane;
+
+    for (unsigned I = 0; I != EltsPerLane; ++I)
+      if (Mask[Pos++] != (int)(ABase + I))
+        return false;
+
+    for (unsigned I = 0; I != EltsPerLane; ++I)
+      if (Mask[Pos++] != (int)(BBase + I))
+        return false;
+  }
+
+  return true;
+}
+
+// Check whether this is a shuffle that interleaves the lanes of the two input
+// vectors, and return the two inputs. e.g. for v16i32 the mask is <0, 1, 2,
+// 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23>.
+static bool isLaneInterleaveShuffle(MVT VT, SDValue Shuf, SDValue &A,
+                                    SDValue &B, const SelectionDAG &DAG,
+                                    const X86Subtarget &Subtarget) {
+  // For the _mm_pack{u|s}s variants, the shuffle is trivial and therefore
+  // elided.
+  if (VT == MVT::v16i16 || VT == MVT::v8i32) {
+    if (Shuf.getOpcode() == ISD::CONCAT_VECTORS && Shuf.getNumOperands() == 2) {
+      A = Shuf->getOperand(0);
+      B = Shuf->getOperand(1);
+      return true;
+    }
+
+    return false;
+  }
+
+  auto *SVN = dyn_cast<ShuffleVectorSDNode>(Shuf.getNode());
+  if (!SVN)
+    return false;
+
+  ArrayRef<int> TargetMask = SVN->getMask();
+  SDValue V1 = SVN->getOperand(0);
+  SDValue V2 = SVN->getOperand(1);
+
+  if (isLaneInterleaveMask(TargetMask, VT)) {
+    auto peelConcat = [](SDValue V) -> SDValue {
+      if (V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2)
+        return V.getOperand(0);
+      return V;
+    };
+
+    // The upper half is undefined.
+    A = peelConcat(V1);
+    B = peelConcat(V2);
+    return true;
+  }
+
+  return false;
+}
+
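+// For example (mirroring the packus tests below), with SSE4.1 this lets
+// combineTruncateWithSat turn
+//
+//   %sh  = shufflevector <4 x i32> %a, <4 x i32> %b,
+//                        <8 x i32> <i32 0, i32 1, i32 2, i32 3,
+//                                   i32 4, i32 5, i32 6, i32 7>
+//   %sat = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %sh,
+//                                          <8 x i32> splat (i32 65535))
+//   %tr  = trunc nuw <8 x i32> %sat to <8 x i16>
+//
+// into a single X86ISD::PACKUS (packusdw %xmm1, %xmm0).
+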
 /// Detect patterns of truncation with unsigned saturation:
 ///
 /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
@@ -52973,42 +53058,68 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
                                           Subtarget);
   }
 
+  if (!(SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8))
+    return SDValue();
+
+  unsigned TruncOpc = 0;
+  SDValue SatVal;
+  if (SDValue SSatVal = detectSSatPattern(In, VT)) {
+    SatVal = SSatVal;
+    TruncOpc = X86ISD::VTRUNCS;
+  } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
+    SatVal = USatVal;
+    TruncOpc = X86ISD::VTRUNCUS;
+  } else {
+    return SDValue();
+  }
+
+  unsigned ResElts = VT.getVectorNumElements();
+
+  bool IsEpi16 = (SVT == MVT::i8 && InSVT == MVT::i16);
+  bool IsEpi32 = (SVT == MVT::i16 && InSVT == MVT::i32);
+
+  // Is there an advantageous pack given the current types and features?
+  unsigned Width = VT.getSizeInBits();
+  bool HasPackForWidth =
+      (Width == 128 && Subtarget.hasSSE41()) ||
+      (Width == 256 && Subtarget.hasAVX2()) ||
+      (Width == 512 && Subtarget.hasBWI() && Subtarget.hasVLX());
+
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
-      Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
-      (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
-    unsigned TruncOpc = 0;
-    SDValue SatVal;
-    if (SDValue SSatVal = detectSSatPattern(In, VT)) {
-      SatVal = SSatVal;
-      TruncOpc = X86ISD::VTRUNCS;
-    } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
-      SatVal = USatVal;
-      TruncOpc = X86ISD::VTRUNCUS;
-    }
-    if (SatVal) {
-      unsigned ResElts = VT.getVectorNumElements();
-      // If the input type is less than 512 bits and we don't have VLX, we need
-      // to widen to 512 bits.
-      if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
-        unsigned NumConcats = 512 / InVT.getSizeInBits();
-        ResElts *= NumConcats;
-        SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
-        ConcatOps[0] = SatVal;
-        InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
-                                NumConcats * InVT.getVectorNumElements());
-        SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
-      }
-      // Widen the result if its narrower than 128 bits.
-      if (ResElts * SVT.getSizeInBits() < 128)
-        ResElts = 128 / SVT.getSizeInBits();
-      EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
-      SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
-      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
-                         DAG.getVectorIdxConstant(0, DL));
+  if (HasPackForWidth && (IsEpi16 || IsEpi32)) {
+    SDValue A, B;
+    if (isLaneInterleaveShuffle(InVT.getSimpleVT(), SatVal, A, B, DAG,
+                                Subtarget)) {
+      unsigned PackOpc =
+          TruncOpc == X86ISD::VTRUNCS ? X86ISD::PACKSS : X86ISD::PACKUS;
+
+      return DAG.getNode(PackOpc, DL, VT, A, B);
     }
   }
 
+  if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
+      Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
+
+    // If the input type is less than 512 bits and we don't have VLX, we
+    // need to widen to 512 bits.
+    if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
+      unsigned NumConcats = 512 / InVT.getSizeInBits();
+      ResElts *= NumConcats;
+      SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
+      ConcatOps[0] = SatVal;
+      InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
+                              NumConcats * InVT.getVectorNumElements());
+      SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
+    }
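+    // e.g. a v8i32 -> v8i16 saturating truncate without VLX: the v8i32
+    // input is concatenated with undef up to v16i32 so that a 512-bit
+    // VTRUNC[U]S node can be formed, and the original v8i16 result is
+    // extracted from it below.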
+    // Widen the result if it's narrower than 128 bits.
+    if (ResElts * SVT.getSizeInBits() < 128)
+      ResElts = 128 / SVT.getSizeInBits();
+    EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
+    SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+                       DAG.getVectorIdxConstant(0, DL));
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll
index 36e374bd2e67c..86d2ae3d0f800 100644
--- a/llvm/test/CodeGen/X86/combine-sub-usat.ll
+++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll
@@ -251,18 +251,12 @@ define <8 x i16> @combine_trunc_v8i32_v8i16(<8 x i16> %a0, <8 x i32> %a1) {
 ;
 ; SSE41-LABEL: combine_trunc_v8i32_v8i16:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT:    pminud %xmm3, %xmm2
-; SSE41-NEXT:    pminud %xmm3, %xmm1
 ; SSE41-NEXT:    packusdw %xmm2, %xmm1
 ; SSE41-NEXT:    psubusw %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; SSE42-LABEL: combine_trunc_v8i32_v8i16:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE42-NEXT:    pminud %xmm3, %xmm2
-; SSE42-NEXT:    pminud %xmm3, %xmm1
 ; SSE42-NEXT:    packusdw %xmm2, %xmm1
 ; SSE42-NEXT:    psubusw %xmm1, %xmm0
 ; SSE42-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/masked_packss.ll b/llvm/test/CodeGen/X86/masked_packss.ll
new file mode 100644
index 0000000000000..183cfec4a7933
--- /dev/null
+++ b/llvm/test/CodeGen/X86/masked_packss.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX512
+
+define <16 x i8> @_mm_mask_packss_epi16_manual(<16 x i8> %src, i16 noundef %k, <8 x i16> %a, <8 x i16> %b) unnamed_addr {
+; AVX2-LABEL: _mm_mask_packss_epi16_manual:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vmovd %edi, %xmm2
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpeqb %xmm3, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: _mm_mask_packss_epi16_manual:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    kmovd %edi, %k1
+; AVX512-NEXT:    vpacksswb %xmm2, %xmm1, %xmm0 {%k1}
+; AVX512-NEXT:    retq
+  %sh = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %minv = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %sh, <16 x i16> splat (i16 -128))
+  %sat = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %minv, <16 x i16> splat (i16 127))
+  %tr = trunc <16 x i16> %sat to <16 x i8>
+  %mk = bitcast i16 %k to <16 x i1>
+  %res = select <16 x i1> %mk, <16 x i8> %tr, <16 x i8> %src
+  ret <16 x i8> %res
+}
+
+define <32 x i8> @_mm256_mask_packss_epi16_manual(<32 x i8> %src, i32 noundef %k, <16 x i16> %a, <16 x i16> %b) unnamed_addr {
+; AVX2-LABEL: _mm256_mask_packss_epi16_manual:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpacksswb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vmovd %edi, %xmm2
+; AVX2-NEXT:    vpbroadcastd %xmm2, %ymm2
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,18,18,18,18,18,18,18,18,27,27,27,27,27,27,27,27]
+; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; AVX2-NEXT:    vpand
%ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: _mm256_mask_packss_epi16_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpacksswb %ymm2, %ymm1, %ymm0 {%k1} +; AVX512-NEXT: retq + %sh = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> + %minv = tail call <32 x i16> @llvm.smax.v32i16(<32 x i16> %sh, <32 x i16> splat (i16 -128)) + %sat = tail call <32 x i16> @llvm.smin.v32i16(<32 x i16> %minv, <32 x i16> splat (i16 127)) + %tr = trunc <32 x i16> %sat to <32 x i8> + %mk = bitcast i32 %k to <32 x i1> + %res = select <32 x i1> %mk, <32 x i8> %tr, <32 x i8> %src + ret <32 x i8> %res +} + +define <64 x i8> @_mm512_mask_packss_epi16_manual(<64 x i8> %src, i64 noundef %k, <32 x i16> %a, <32 x i16> %b) unnamed_addr { +; AVX2-LABEL: _mm512_mask_packss_epi16_manual: +; AVX2: # %bb.0: +; AVX2-NEXT: vpacksswb %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpacksswb %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vmovq %rdi, %xmm4 +; AVX2-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,18,18,18,18,18,18,18,18,27,27,27,27,27,27,27,27] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[4,4,4,4,4,4,4,4,13,13,13,13,13,13,13,13,22,22,22,22,22,22,22,22,31,31,31,31,31,31,31,31] +; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: _mm512_mask_packss_epi16_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovq %rdi, %k1 +; AVX512-NEXT: vpacksswb %zmm2, %zmm1, %zmm0 {%k1} +; AVX512-NEXT: retq + %sh = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> + %minv = tail call <64 x i16> @llvm.smax.v64i16(<64 x i16> %sh, <64 x i16> splat (i16 -128)) + %sat = tail call <64 x i16> @llvm.smin.v64i16(<64 x i16> %minv, <64 x i16> splat (i16 127)) + %tr = trunc <64 x i16> %sat to <64 x i8> + %mk = bitcast i64 %k to <64 x i1> + %res = select <64 x i1> %mk, <64 x i8> %tr, <64 x i8> %src + ret <64 x i8> %res +} + +define <8 x i16> @_mm_mask_packss_epi32_manual(<8 x i16> %src, i8 noundef %k, <4 x i32> %a, <4 x i32> %b) unnamed_addr { +; AVX2-LABEL: _mm_mask_packss_epi32_manual: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: _mm_mask_packss_epi32_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpackssdw %xmm2, %xmm1, %xmm0 {%k1} +; AVX512-NEXT: retq + %sh = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> + %minv = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> %sh, <8 x i32> splat (i32 -32768)) + %sat = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %minv, <8 x i32> splat (i32 32767)) + %tr = trunc <8 x i32> %sat to <8 x i16> + %mk = bitcast i8 %k to <8 x i1> + %res = select <8 x i1> %mk, <8 x i16> %tr, <8 x i16> %src + ret <8 x i16> %res +} + +define <16 x i16> @_mm256_mask_packss_epi32_manual(<16 x i16> %src, i16 noundef %k, <8 x i32> %a, <8 x 
i32> %b) unnamed_addr { +; AVX2-LABEL: _mm256_mask_packss_epi32_manual: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackssdw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: _mm256_mask_packss_epi32_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpackssdw %ymm2, %ymm1, %ymm0 {%k1} +; AVX512-NEXT: retq + %sh = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> + %minv = tail call <16 x i32> @llvm.smax.v16i32(<16 x i32> %sh, <16 x i32> splat (i32 -32768)) + %sat = tail call <16 x i32> @llvm.smin.v16i32(<16 x i32> %minv, <16 x i32> splat (i32 32767)) + %tr = trunc <16 x i32> %sat to <16 x i16> + %mk = bitcast i16 %k to <16 x i1> + %res = select <16 x i1> %mk, <16 x i16> %tr, <16 x i16> %src + ret <16 x i16> %res +} + +define <32 x i16> @_mm512_mask_packss_epi32_manual(<32 x i16> %src, i32 noundef %k, <16 x i32> %a, <16 x i32> %b) unnamed_addr { +; AVX2-LABEL: _mm512_mask_packss_epi32_manual: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackssdw %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vpackssdw %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vmovd %edi, %xmm4 +; AVX2-NEXT: vpbroadcastw %xmm4, %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: shrl $16, %edi +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: _mm512_mask_packss_epi32_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpackssdw %zmm2, %zmm1, %zmm0 {%k1} +; AVX512-NEXT: retq + %sh = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> + %minv = tail call <32 x i32> @llvm.smax.v32i32(<32 x i32> %sh, <32 x i32> splat (i32 -32768)) + %sat = tail call <32 x i32> @llvm.smin.v32i32(<32 x i32> %minv, <32 x i32> splat (i32 32767)) + %tr = trunc <32 x i32> %sat to <32 x i16> + %mk = bitcast i32 %k to <32 x i1> + %res = select <32 x i1> %mk, <32 x i16> %tr, <32 x i16> %src + ret <32 x i16> %res +} + +declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.smax.v32i16(<32 x i16>, <32 x i16>) +declare <64 x i16> @llvm.smax.v64i16(<64 x i16>, <64 x i16>) + +declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.smin.v32i16(<32 x i16>, <32 x i16>) +declare <64 x i16> @llvm.smin.v64i16(<64 x i16>, <64 x i16>) + +declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>) +declare <16 x i32> @llvm.smax.v16i32(<16 x i32>, <16 x i32>) +declare <32 x i32> @llvm.smax.v32i32(<32 x i32>, <32 x i32>) + +declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>) +declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>) +declare <32 x i32> @llvm.smin.v32i32(<32 x i32>, <32 x i32>) diff --git a/llvm/test/CodeGen/X86/masked_packus.ll b/llvm/test/CodeGen/X86/masked_packus.ll new file mode 100644 index 0000000000000..471a5959c9bd9 --- /dev/null +++ b/llvm/test/CodeGen/X86/masked_packus.ll @@ -0,0 +1,197 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX512 + +define <16 x i8> @_mm_mask_packus_epi16_manual(<16 x i8> %src, i16 noundef %k, <8 x i16> %a, <8 x i16> %b) unnamed_addr { +; AVX2-LABEL: _mm_mask_packus_epi16_manual: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: _mm_mask_packus_epi16_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpackuswb %xmm2, %xmm1, %xmm0 {%k1} +; AVX512-NEXT: retq + %sh = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> + %sat = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %sh, <16 x i16> splat (i16 255)) + %tr = trunc nuw <16 x i16> %sat to <16 x i8> + %mk = bitcast i16 %k to <16 x i1> + %res = select <16 x i1> %mk, <16 x i8> %tr, <16 x i8> %src + ret <16 x i8> %res +} + +define <32 x i8> @_mm256_mask_packus_epi16_manual(<32 x i8> %src, i32 noundef %k, <16 x i16> %a, <16 x i16> %b) unnamed_addr { +; AVX2-LABEL: _mm256_mask_packus_epi16_manual: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpbroadcastd %xmm2, %ymm2 +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,18,18,18,18,18,18,18,18,27,27,27,27,27,27,27,27] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: _mm256_mask_packus_epi16_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpackuswb %ymm2, %ymm1, %ymm0 {%k1} +; AVX512-NEXT: retq + %sh = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> + %sat = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %sh, <32 x i16> splat (i16 255)) + %tr = trunc nuw <32 x i16> %sat to <32 x i8> + %mk = bitcast i32 %k to <32 x i1> + %res = select <32 x i1> %mk, <32 x i8> %tr, <32 x i8> %src + ret <32 x i8> %res +} + +define <64 x i8> @_mm512_mask_packus_epi16_manual(<64 x i8> %src, i64 noundef %k, <32 x i16> %a, <32 x i16> %b) unnamed_addr { +; AVX2-LABEL: _mm512_mask_packus_epi16_manual: +; AVX2: # %bb.0: +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm6 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm5 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpminuw %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpminuw %ymm4, %ymm5, %ymm5 +; AVX2-NEXT: vpackuswb %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vpminuw %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpminuw %ymm4, %ymm6, %ymm4 +; AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vmovq %rdi, %xmm4 +; AVX2-NEXT: vpbroadcastq %xmm4, %ymm4 +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 
= ymm4[0,0,0,0,0,0,0,0,9,9,9,9,9,9,9,9,18,18,18,18,18,18,18,18,27,27,27,27,27,27,27,27] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpcmpeqb %ymm6, %ymm5, %ymm5 +; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[4,4,4,4,4,4,4,4,13,13,13,13,13,13,13,13,22,22,22,22,22,22,22,22,31,31,31,31,31,31,31,31] +; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqb %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: _mm512_mask_packus_epi16_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovq %rdi, %k1 +; AVX512-NEXT: vpackuswb %zmm2, %zmm1, %zmm0 {%k1} +; AVX512-NEXT: retq + %sh = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> + %sat = tail call <64 x i16> @llvm.umin.v64i16(<64 x i16> %sh, <64 x i16> splat (i16 255)) + %tr = trunc nuw <64 x i16> %sat to <64 x i8> + %mk = bitcast i64 %k to <64 x i1> + %res = select <64 x i1> %mk, <64 x i8> %tr, <64 x i8> %src + ret <64 x i8> %res +} + +define <8 x i16> @_mm_mask_packus_epi32_manual(<8 x i16> %src, i8 noundef %k, <4 x i32> %a, <4 x i32> %b) unnamed_addr { +; AVX2-LABEL: _mm_mask_packus_epi32_manual: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: _mm_mask_packus_epi32_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpackusdw %xmm2, %xmm1, %xmm0 {%k1} +; AVX512-NEXT: retq + %sh = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> + %sat = tail call <8 x i32> @llvm.umin.v8i32(<8 x i32> %sh, <8 x i32> splat (i32 65535)) + %tr = trunc nuw <8 x i32> %sat to <8 x i16> + %mk = bitcast i8 %k to <8 x i1> + %res = select <8 x i1> %mk, <8 x i16> %tr, <8 x i16> %src + ret <8 x i16> %res +} + +define <16 x i16> @_mm256_mask_packus_epi32_manual(<16 x i16> %src, i16 noundef %k, <8 x i32> %a, <8 x i32> %b) unnamed_addr { +; AVX2-LABEL: _mm256_mask_packus_epi32_manual: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackusdw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqw %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: _mm256_mask_packus_epi32_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpackusdw %ymm2, %ymm1, %ymm0 {%k1} +; AVX512-NEXT: retq + %sh = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> + %sat = tail call <16 x i32> @llvm.umin.v16i32(<16 x i32> %sh, <16 x i32> splat (i32 65535)) + %tr = trunc nuw <16 x i32> %sat to <16 x i16> + %mk = bitcast i16 %k to <16 x i1> + %res = select <16 x i1> %mk, <16 x i16> %tr, <16 x i16> %src + ret <16 x i16> %res +} + +define <32 x i16> @_mm512_mask_packus_epi32_manual(<32 x i16> %src, i32 noundef %k, <16 x i32> %a, <16 x i32> %b) unnamed_addr { +; AVX2-LABEL: _mm512_mask_packus_epi32_manual: +; AVX2: # %bb.0: +; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm3, %ymm6 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm5[2,3] +; AVX2-NEXT: vinserti128 $1, 
%xmm4, %ymm2, %ymm5 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm4[2,3] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX2-NEXT: vpminud %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpminud %ymm4, %ymm5, %ymm5 +; AVX2-NEXT: vpackusdw %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vpminud %ymm4, %ymm3, %ymm3 +; AVX2-NEXT: vpminud %ymm4, %ymm6, %ymm4 +; AVX2-NEXT: vpackusdw %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,1,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] +; AVX2-NEXT: vmovd %edi, %xmm4 +; AVX2-NEXT: vpbroadcastw %xmm4, %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] +; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpcmpeqw %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: shrl $16, %edi +; AVX2-NEXT: vmovd %edi, %xmm2 +; AVX2-NEXT: vpbroadcastw %xmm2, %ymm2 +; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqw %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: retq +; +; AVX512-LABEL: _mm512_mask_packus_epi32_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: kmovd %edi, %k1 +; AVX512-NEXT: vpackusdw %zmm2, %zmm1, %zmm0 {%k1} +; AVX512-NEXT: retq + %sh = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> + %sat = tail call <32 x i32> @llvm.umin.v32i32(<32 x i32> %sh, <32 x i32> splat (i32 65535)) + %tr = trunc nuw <32 x i32> %sat to <32 x i16> + %mk = bitcast i32 %k to <32 x i1> + %res = select <32 x i1> %mk, <32 x i16> %tr, <32 x i16> %src + ret <32 x i16> %res +} + +declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>) +declare <32 x i16> @llvm.umin.v32i16(<32 x i16>, <32 x i16>) +declare <64 x i16> @llvm.umin.v64i16(<64 x i16>, <64 x i16>) + +declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>) +declare <16 x i32> @llvm.umin.v16i32(<16 x i32>, <16 x i32>) +declare <32 x i32> @llvm.umin.v32i32(<32 x i32>, <32 x i32>) diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index 4c4b6e78d1f8c..06ef2293fc20c 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -4383,9 +4383,6 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE4-LABEL: truncstore_v8i32_v8i16: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pmovsxbw {{.*#+}} xmm5 = [65535,0,65535,0,65535,0,65535,0] -; SSE4-NEXT: pminud %xmm5, %xmm1 -; SSE4-NEXT: pminud %xmm5, %xmm0 ; SSE4-NEXT: packusdw %xmm1, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm4, %xmm3 ; SSE4-NEXT: pcmpeqd %xmm4, %xmm2 @@ -7303,9 +7300,6 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, ptr %p, <16 x i8> %mask) { ; SSE4-LABEL: truncstore_v16i16_v16i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE4-NEXT: pminuw %xmm4, %xmm1 -; SSE4-NEXT: pminuw %xmm4, %xmm0 ; SSE4-NEXT: packuswb %xmm1, %xmm0 ; SSE4-NEXT: pcmpeqb %xmm2, %xmm3 ; SSE4-NEXT: pmovmskb %xmm3, %eax diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll index 35919f65d3de0..20a392134d3c2 100644 --- a/llvm/test/CodeGen/X86/packss.ll +++ b/llvm/test/CodeGen/X86/packss.ll @@ -7,6 +7,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4,X64-SSE ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X64-AVX ; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X64-AVX +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX512 define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind { ; SSE2-LABEL: trunc_ashr_v4i64: @@ -44,6 +46,13 @@ define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: trunc_ashr_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsraq $63, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: ret{{[l|q]}} %1 = ashr <4 x i64> %a, %2 = trunc <4 x i64> %1 to <4 x i32> ret <4 x i32> %2 @@ -103,6 +112,13 @@ define <8 x i16> @trunc_ashr_v4i64_bitcast(<4 x i64> %a0) { ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: trunc_ashr_v4i64_bitcast: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsraq $49, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: ret{{[l|q]}} %1 = ashr <4 x i64> %a0, %2 = bitcast <4 x i64> %1 to <8 x i32> %3 = trunc <8 x i32> %2 to <8 x i16> @@ -133,6 +149,13 @@ define <8 x i16> @trunc_ashr_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: trunc_ashr_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrad $31, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: ret{{[l|q]}} %1 = ashr <8 x i32> %a, %2 = trunc <8 x i32> %1 to <8 x i16> ret <8 x i16> %2 @@ -224,6 +247,15 @@ define <8 x i16> @trunc_ashr_v4i64_demandedelts(<4 x i64> %a0) { ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: trunc_ashr_v4i64_demandedelts: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllq $63, %ymm0, %ymm0 +; AVX512-NEXT: vpsraq $63, %ymm0, %ymm0 +; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: ret{{[l|q]}} %1 = shl <4 x i64> %a0, %2 = ashr <4 x i64> %1, %3 = bitcast <4 x i64> %2 to <8 x i32> @@ -246,6 +278,13 @@ define <16 x i8> @packsswb_icmp_zero_128(<8 x i16> %a0) { ; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: packsswb_icmp_zero_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vptestnmw %xmm0, %xmm0, %k0 +; AVX512-NEXT: vpmovm2b %k0, %xmm0 +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512-NEXT: ret{{[l|q]}} %1 = icmp eq <8 x i16> %a0, zeroinitializer %2 = sext <8 x i1> %1 to <8 x i8> %3 = shufflevector <8 x i8> %2, <8 x i8> zeroinitializer, <16 x i32> @@ -266,6 +305,13 @@ define <16 x i8> @packsswb_icmp_zero_trunc_128(<8 x i16> %a0) { ; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: packsswb_icmp_zero_trunc_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: ret{{[l|q]}} %1 = icmp eq <8 x i16> %a0, zeroinitializer %2 = sext <8 x i1> %1 to <8 x i16> %3 = shufflevector <8 x i16> %2, <8 x i16> zeroinitializer, <16 x i32> @@ -303,6 +349,13 @@ define <32 x i8> @packsswb_icmp_zero_256(<16 x i16> 
%a0) { ; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: packsswb_icmp_zero_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: ret{{[l|q]}} %1 = icmp eq <16 x i16> %a0, zeroinitializer %2 = sext <16 x i1> %1 to <16 x i16> %3 = bitcast <16 x i16> %2 to <32 x i8> @@ -341,9 +394,238 @@ define <32 x i8> @packsswb_icmp_zero_trunc_256(<16 x i16> %a0) { ; AVX2-NEXT: vpacksswb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: packsswb_icmp_zero_trunc_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: movb $-52, %al +; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512-NEXT: ret{{[l|q]}} %1 = icmp eq <16 x i16> %a0, zeroinitializer %2 = sext <16 x i1> %1 to <16 x i16> %3 = shufflevector <16 x i16> zeroinitializer, <16 x i16> %2, <32 x i32> %4 = trunc <32 x i16> %3 to <32 x i8> ret <32 x i8> %4 } + + +define <16 x i8> @_mm_packss_epi16_manual(<8 x i16> %a, <8 x i16> %b) { +; SSE-LABEL: _mm_packss_epi16_manual: +; SSE: # %bb.0: +; SSE-NEXT: packsswb %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: _mm_packss_epi16_manual: +; AVX: # %bb.0: +; AVX-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: _mm_packss_epi16_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: ret{{[l|q]}} + %sh = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> + %minv = tail call <16 x i16> @llvm.smax.v16i16(<16 x i16> %sh, <16 x i16> splat (i16 -128)) + %sat = tail call <16 x i16> @llvm.smin.v16i16(<16 x i16> %minv, <16 x i16> splat (i16 127)) + %tr = trunc <16 x i16> %sat to <16 x i8> + ret <16 x i8> %tr +} + +define <32 x i8> @_mm256_packss_epi16_manual(<16 x i16> %a, <16 x i16> %b) { +; X86-SSE-LABEL: _mm256_packss_epi16_manual: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %ebp, -8 +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: .cfi_def_cfa_register %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: packsswb %xmm2, %xmm0 +; X86-SSE-NEXT: packsswb 8(%ebp), %xmm1 +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: .cfi_def_cfa %esp, 4 +; X86-SSE-NEXT: retl +; +; AVX1-LABEL: _mm256_packss_epi16_manual: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpacksswb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: _mm256_packss_epi16_manual: +; AVX2: # %bb.0: +; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; X64-SSE-LABEL: _mm256_packss_epi16_manual: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: packsswb %xmm2, %xmm0 +; X64-SSE-NEXT: packsswb %xmm3, %xmm1 +; X64-SSE-NEXT: retq +; +; AVX512-LABEL: _mm256_packss_epi16_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: ret{{[l|q]}} + %sh = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> + %minv = tail call <32 x i16> @llvm.smax.v32i16(<32 x i16> %sh, <32 x i16> splat (i16 -128)) + %sat = tail call <32 x i16> 
@llvm.smin.v32i16(<32 x i16> %minv, <32 x i16> splat (i16 127)) + %tr = trunc <32 x i16> %sat to <32 x i8> + ret <32 x i8> %tr +} + +define <64 x i8> @_mm512_packss_epi16_manual(<32 x i16> %a, <32 x i16> %b) { +; X86-SSE-LABEL: _mm512_packss_epi16_manual: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %ebp, -8 +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: .cfi_def_cfa_register %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE-NEXT: packsswb 24(%ebp), %xmm0 +; X86-SSE-NEXT: packsswb 40(%ebp), %xmm1 +; X86-SSE-NEXT: packsswb 56(%ebp), %xmm2 +; X86-SSE-NEXT: packsswb 72(%ebp), %xmm3 +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: .cfi_def_cfa %esp, 4 +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: _mm512_packss_epi16_manual: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: packsswb %xmm4, %xmm0 +; X64-SSE-NEXT: packsswb %xmm5, %xmm1 +; X64-SSE-NEXT: packsswb %xmm6, %xmm2 +; X64-SSE-NEXT: packsswb %xmm7, %xmm3 +; X64-SSE-NEXT: retq +; +; AVX512-LABEL: _mm512_packss_epi16_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: vpacksswb %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: ret{{[l|q]}} + %sh = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> + %minv = tail call <64 x i16> @llvm.smax.v64i16(<64 x i16> %sh, <64 x i16> splat (i16 -128)) + %sat = tail call <64 x i16> @llvm.smin.v64i16(<64 x i16> %minv, <64 x i16> splat (i16 127)) + %tr = trunc <64 x i16> %sat to <64 x i8> + ret <64 x i8> %tr +} + +define <8 x i16> @_mm_packss_epi32_manual(<4 x i32> %a, <4 x i32> %b) { +; SSE-LABEL: _mm_packss_epi32_manual: +; SSE: # %bb.0: +; SSE-NEXT: packssdw %xmm1, %xmm0 +; SSE-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: _mm_packss_epi32_manual: +; AVX: # %bb.0: +; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: _mm_packss_epi32_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: ret{{[l|q]}} + %sh = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> + %minv = tail call <8 x i32> @llvm.smax.v8i32(<8 x i32> %sh, <8 x i32> splat (i32 -32768)) + %sat = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %minv, <8 x i32> splat (i32 32767)) + %tr = trunc <8 x i32> %sat to <8 x i16> + ret <8 x i16> %tr +} + +define <16 x i16> @_mm256_packss_epi32_manual(<8 x i32> %a, <8 x i32> %b) { +; X86-SSE-LABEL: _mm256_packss_epi32_manual: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %ebp, -8 +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: .cfi_def_cfa_register %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: packssdw %xmm2, %xmm0 +; X86-SSE-NEXT: packssdw 8(%ebp), %xmm1 +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: .cfi_def_cfa %esp, 4 +; X86-SSE-NEXT: retl +; +; AVX1-LABEL: _mm256_packss_epi32_manual: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: _mm256_packss_epi32_manual: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; X64-SSE-LABEL: _mm256_packss_epi32_manual: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: packssdw %xmm2, %xmm0 +; X64-SSE-NEXT: packssdw %xmm3, %xmm1 +; X64-SSE-NEXT: retq +; +; 
AVX512-LABEL: _mm256_packss_epi32_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: ret{{[l|q]}} + %sh = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> + %minv = tail call <16 x i32> @llvm.smax.v16i32(<16 x i32> %sh, <16 x i32> splat (i32 -32768)) + %sat = tail call <16 x i32> @llvm.smin.v16i32(<16 x i32> %minv, <16 x i32> splat (i32 32767)) + %tr = trunc <16 x i32> %sat to <16 x i16> + ret <16 x i16> %tr +} + +define <32 x i16> @_mm512_packss_epi32_manual(<16 x i32> %a, <16 x i32> %b) { +; X86-SSE-LABEL: _mm512_packss_epi32_manual: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %ebp, -8 +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: .cfi_def_cfa_register %ebp +; X86-SSE-NEXT: andl $-16, %esp +; X86-SSE-NEXT: subl $16, %esp +; X86-SSE-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE-NEXT: packssdw 24(%ebp), %xmm0 +; X86-SSE-NEXT: packssdw 40(%ebp), %xmm1 +; X86-SSE-NEXT: packssdw 56(%ebp), %xmm2 +; X86-SSE-NEXT: packssdw 72(%ebp), %xmm3 +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: .cfi_def_cfa %esp, 4 +; X86-SSE-NEXT: retl +; +; X64-SSE-LABEL: _mm512_packss_epi32_manual: +; X64-SSE: # %bb.0: +; X64-SSE-NEXT: packssdw %xmm4, %xmm0 +; X64-SSE-NEXT: packssdw %xmm5, %xmm1 +; X64-SSE-NEXT: packssdw %xmm6, %xmm2 +; X64-SSE-NEXT: packssdw %xmm7, %xmm3 +; X64-SSE-NEXT: retq +; +; AVX512-LABEL: _mm512_packss_epi32_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: vpackssdw %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: ret{{[l|q]}} + %sh = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> + %minv = tail call <32 x i32> @llvm.smax.v32i32(<32 x i32> %sh, <32 x i32> splat (i32 -32768)) + %sat = tail call <32 x i32> @llvm.smin.v32i32(<32 x i32> %minv, <32 x i32> splat (i32 32767)) + %tr = trunc <32 x i32> %sat to <32 x i16> + ret <32 x i16> %tr +} diff --git a/llvm/test/CodeGen/X86/packus.ll b/llvm/test/CodeGen/X86/packus.ll index 384e40496d82a..9cc859e1274e7 100644 --- a/llvm/test/CodeGen/X86/packus.ll +++ b/llvm/test/CodeGen/X86/packus.ll @@ -7,6 +7,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,X64-AVX,X64-AVX1 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X86-AVX,X86-AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,X64-AVX,X64-AVX2 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX512 define <4 x i32> @trunc_lshr_v4i64(<4 x i64> %a) nounwind { ; SSE2-LABEL: trunc_lshr_v4i64: @@ -39,6 +41,13 @@ define <4 x i32> @trunc_lshr_v4i64(<4 x i64> %a) nounwind { ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: trunc_lshr_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlq $63, %ymm0, %ymm0 +; AVX512-NEXT: vpmovqd %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: ret{{[l|q]}} %1 = lshr <4 x i64> %a, %2 = trunc <4 x i64> %1 to <4 x i32> ret <4 x i32> %2 @@ -75,6 +84,13 @@ define <8 x i16> @trunc_lshr_v4i64_bitcast(<4 x i64> %a0) { ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: trunc_lshr_v4i64_bitcast: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlq $49, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512-NEXT: 
vzeroupper +; AVX512-NEXT: ret{{[l|q]}} %1 = lshr <4 x i64> %a0, %2 = bitcast <4 x i64> %1 to <8 x i32> %3 = trunc <8 x i32> %2 to <8 x i16> @@ -112,6 +128,13 @@ define <8 x i16> @trunc_lshr_v8i32(<8 x i32> %a) nounwind { ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: trunc_lshr_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $31, %ymm0, %ymm0 +; AVX512-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: ret{{[l|q]}} %1 = lshr <8 x i32> %a, %2 = trunc <8 x i32> %1 to <8 x i16> ret <8 x i16> %2 @@ -187,6 +210,13 @@ define <16 x i8> @shuffle_lshr_2v8i16(<8 x i16> %a0, <8 x i16> %a1) { ; AVX-NEXT: vpsrlw $15, %xmm1, %xmm1 ; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: shuffle_lshr_2v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $15, %xmm1, %xmm1 +; AVX512-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: ret{{[l|q]}} %lshr0 = lshr <8 x i16> %a0, %lshr1 = lshr <8 x i16> %a1, %bc0 = bitcast <8 x i16> %lshr0 to <16 x i8> @@ -216,6 +246,13 @@ define <8 x i16> @shuffle_lshr_2v4i32(<4 x i32> %a0, <4 x i32> %a1) { ; AVX-NEXT: vpsrld $31, %xmm1, %xmm1 ; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: shuffle_lshr_2v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrld $31, %xmm0, %xmm0 +; AVX512-NEXT: vpsrld $31, %xmm1, %xmm1 +; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: ret{{[l|q]}} %lshr0 = lshr <4 x i32> %a0, %lshr1 = lshr <4 x i32> %a1, %bc0 = bitcast <4 x i32> %lshr0 to <8 x i16> @@ -245,6 +282,13 @@ define <4 x i32> @shuffle_lshr_2v2i64(<2 x i64> %a0, <2 x i64> %a1) { ; AVX-NEXT: vpsrlq $63, %xmm1, %xmm1 ; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: shuffle_lshr_2v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlq $63, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlq $63, %xmm1, %xmm1 +; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: ret{{[l|q]}} %lshr0 = lshr <2 x i64> %a0, %lshr1 = lshr <2 x i64> %a1, %bc0 = bitcast <2 x i64> %lshr0 to <4 x i32> @@ -274,6 +318,13 @@ define <4 x float> @shuffle_lshr_2v2i64_bitcast(<2 x i64> %a0, <2 x i64> %a1) { ; AVX-NEXT: vpsrlq $63, %xmm1, %xmm1 ; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: shuffle_lshr_2v2i64_bitcast: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsrlq $63, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlq $63, %xmm1, %xmm1 +; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: ret{{[l|q]}} %lshr0 = lshr <2 x i64> %a0, %lshr1 = lshr <2 x i64> %a1, %bc0 = bitcast <2 x i64> %lshr0 to <4 x float> @@ -318,6 +369,13 @@ define <16 x i8> @packuswb_icmp_zero_128(<8 x i16> %a0) { ; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; X64-AVX-NEXT: retq +; +; AVX512-LABEL: packuswb_icmp_zero_128: +; AVX512: # %bb.0: +; AVX512-NEXT: vptestnmw %xmm0, %xmm0, %k1 +; AVX512-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX512-NEXT: ret{{[l|q]}} %1 = icmp eq <8 x i16> %a0, zeroinitializer %2 = zext <8 x i1> %1 to <8 x i8> %3 = shufflevector <8 x i8> %2, <8 x i8> zeroinitializer, <16 x i32> @@ -340,6 +398,14 @@ define <16 x i8> @packuswb_icmp_zero_trunc_128(<8 x i16> %a0) { ; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0 ; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: packuswb_icmp_zero_trunc_128: +; 
AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $15, %xmm0, %xmm0 +; AVX512-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: ret{{[l|q]}} %1 = icmp eq <8 x i16> %a0, zeroinitializer %2 = zext <8 x i1> %1 to <8 x i16> %3 = shufflevector <8 x i16> %2, <8 x i16> zeroinitializer, <16 x i32> @@ -397,6 +463,14 @@ define <32 x i8> @packuswb_icmp_zero_256(<16 x i16> %a0) { ; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: packuswb_icmp_zero_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlw $15, %ymm0, %ymm0 +; AVX512-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: ret{{[l|q]}} %1 = icmp eq <16 x i16> %a0, zeroinitializer %2 = zext <16 x i1> %1 to <16 x i16> %3 = bitcast <16 x i16> %2 to <32 x i8> @@ -439,16 +513,985 @@ define <32 x i8> @packuswb_icmp_zero_trunc_256(<16 x i16> %a0) { ; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: packuswb_icmp_zero_trunc_256: +; AVX512: # %bb.0: +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpsrlw $15, %ymm0, %ymm0 +; AVX512-NEXT: movb $-52, %al +; AVX512-NEXT: kmovd %eax, %k1 +; AVX512-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpmovwb %zmm0, %ymm0 +; AVX512-NEXT: ret{{[l|q]}} %1 = icmp eq <16 x i16> %a0, zeroinitializer %2 = zext <16 x i1> %1 to <16 x i16> %3 = shufflevector <16 x i16> zeroinitializer, <16 x i16> %2, <32 x i32> %4 = trunc <32 x i16> %3 to <32 x i8> ret <32 x i8> %4 } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; X64-AVX2: {{.*}} -; X64-SSE2: {{.*}} -; X64-SSE4: {{.*}} -; X86-AVX2: {{.*}} -; X86-SSE2: {{.*}} -; X86-SSE4: {{.*}} + +define <16 x i8> @_mm_packus_epi16_manual(<8 x i16> %a, <8 x i16> %b) { +; SSE2-LABEL: _mm_packus_epi16_manual: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psubusw %xmm2, %xmm3 +; SSE2-NEXT: psubw %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psubusw %xmm2, %xmm3 +; SSE2-NEXT: psubw %xmm3, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: ret{{[l|q]}} +; +; SSE4-LABEL: _mm_packus_epi16_manual: +; SSE4: # %bb.0: +; SSE4-NEXT: packuswb %xmm1, %xmm0 +; SSE4-NEXT: ret{{[l|q]}} +; +; AVX-LABEL: _mm_packus_epi16_manual: +; AVX: # %bb.0: +; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: _mm_packus_epi16_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: ret{{[l|q]}} + %sh = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> + %sat = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %sh, <16 x i16> splat (i16 255)) + %tr = trunc nuw <16 x i16> %sat to <16 x i8> + ret <16 x i8> %tr +} + +define <32 x i8> @_mm256_packus_epi16_manual(<16 x i16> %a, <16 x i16> %b) { +; X86-SSE2-LABEL: _mm256_packus_epi16_manual: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE2-NEXT: .cfi_offset %ebp, -8 +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X86-SSE2-NEXT: psubusw %xmm4, %xmm5 +; X86-SSE2-NEXT: psubw %xmm5, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: psubusw %xmm4, %xmm5 +; X86-SSE2-NEXT: psubw %xmm5, %xmm1 +; X86-SSE2-NEXT: packuswb %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X86-SSE2-NEXT: psubusw %xmm4, %xmm3 +; X86-SSE2-NEXT: psubw %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: psubusw %xmm4, %xmm3 +; X86-SSE2-NEXT: psubw %xmm3, %xmm0 +; X86-SSE2-NEXT: packuswb %xmm2, %xmm0 +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp +; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4 +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: _mm256_packus_epi16_manual: +; X64-SSE2: # %bb.0: +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; X64-SSE2-NEXT: movdqa %xmm3, %xmm5 +; X64-SSE2-NEXT: psubusw %xmm4, %xmm5 +; X64-SSE2-NEXT: psubw %xmm5, %xmm3 +; X64-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X64-SSE2-NEXT: psubusw %xmm4, %xmm5 +; X64-SSE2-NEXT: psubw %xmm5, %xmm1 +; X64-SSE2-NEXT: packuswb %xmm3, %xmm1 +; X64-SSE2-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE2-NEXT: psubusw %xmm4, %xmm3 +; X64-SSE2-NEXT: psubw %xmm3, %xmm2 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X64-SSE2-NEXT: psubusw %xmm4, %xmm3 +; X64-SSE2-NEXT: psubw %xmm3, %xmm0 +; X64-SSE2-NEXT: packuswb %xmm2, %xmm0 +; X64-SSE2-NEXT: retq +; +; X86-SSE4-LABEL: _mm256_packus_epi16_manual: +; X86-SSE4: # %bb.0: +; X86-SSE4-NEXT: pushl %ebp +; X86-SSE4-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE4-NEXT: .cfi_offset %ebp, -8 +; X86-SSE4-NEXT: movl %esp, %ebp +; X86-SSE4-NEXT: .cfi_def_cfa_register %ebp +; X86-SSE4-NEXT: andl $-16, %esp +; X86-SSE4-NEXT: subl $16, %esp +; X86-SSE4-NEXT: pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; X86-SSE4-NEXT: pminuw 
%xmm3, %xmm1 +; X86-SSE4-NEXT: pminuw %xmm3, %xmm2 +; X86-SSE4-NEXT: pminuw %xmm3, %xmm0 +; X86-SSE4-NEXT: packuswb %xmm2, %xmm0 +; X86-SSE4-NEXT: pminuw 8(%ebp), %xmm3 +; X86-SSE4-NEXT: packuswb %xmm3, %xmm1 +; X86-SSE4-NEXT: movl %ebp, %esp +; X86-SSE4-NEXT: popl %ebp +; X86-SSE4-NEXT: .cfi_def_cfa %esp, 4 +; X86-SSE4-NEXT: retl +; +; X64-SSE4-LABEL: _mm256_packus_epi16_manual: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; X64-SSE4-NEXT: pminuw %xmm4, %xmm3 +; X64-SSE4-NEXT: pminuw %xmm4, %xmm1 +; X64-SSE4-NEXT: packuswb %xmm3, %xmm1 +; X64-SSE4-NEXT: pminuw %xmm4, %xmm2 +; X64-SSE4-NEXT: pminuw %xmm4, %xmm0 +; X64-SSE4-NEXT: packuswb %xmm2, %xmm0 +; X64-SSE4-NEXT: retq +; +; AVX1-LABEL: _mm256_packus_epi16_manual: +; AVX1: # %bb.0: +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm4 +; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: _mm256_packus_epi16_manual: +; AVX2: # %bb.0: +; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: _mm256_packus_epi16_manual: +; AVX512: # %bb.0: +; AVX512-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: ret{{[l|q]}} + %sh = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> + %sat = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %sh, <32 x i16> splat (i16 255)) + %tr = trunc nuw <32 x i16> %sat to <32 x i8> + ret <32 x i8> %tr +} + +define <64 x i8> @_mm512_packus_epi16_manual(<32 x i16> %a, <32 x i16> %b) { +; X86-SSE2-LABEL: _mm512_packus_epi16_manual: +; X86-SSE2: # %bb.0: +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE2-NEXT: .cfi_offset %ebp, -8 +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm3 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: psubusw %xmm4, %xmm6 +; X86-SSE2-NEXT: psubw %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: psubusw %xmm4, %xmm6 +; X86-SSE2-NEXT: psubw %xmm6, %xmm3 +; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm6 +; X86-SSE2-NEXT: packuswb %xmm5, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm5 +; X86-SSE2-NEXT: psubusw %xmm4, %xmm5 +; X86-SSE2-NEXT: psubw %xmm5, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: psubusw %xmm4, %xmm5 +; X86-SSE2-NEXT: psubw %xmm5, %xmm2 +; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm5 +; X86-SSE2-NEXT: packuswb %xmm6, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: psubusw %xmm4, %xmm6 +; X86-SSE2-NEXT: psubw %xmm6, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: psubusw %xmm4, %xmm6 +; X86-SSE2-NEXT: psubw %xmm6, %xmm1 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE2-NEXT: packuswb %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm5 +; X86-SSE2-NEXT: psubusw %xmm4, %xmm5 +; X86-SSE2-NEXT: psubw %xmm5, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: psubusw %xmm4, %xmm5 +; X86-SSE2-NEXT: psubw %xmm5, %xmm0 +; X86-SSE2-NEXT: packuswb %xmm6, 
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE2-NEXT: retl
+;
+; X64-SSE2-LABEL: _mm512_packus_epi16_manual:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm9
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm9
+; X64-SSE2-NEXT: psubw %xmm9, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm9
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm9
+; X64-SSE2-NEXT: psubw %xmm9, %xmm3
+; X64-SSE2-NEXT: packuswb %xmm7, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm7
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm7
+; X64-SSE2-NEXT: psubw %xmm7, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm7
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm7
+; X64-SSE2-NEXT: psubw %xmm7, %xmm2
+; X64-SSE2-NEXT: packuswb %xmm6, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm6
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm6
+; X64-SSE2-NEXT: psubw %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm6
+; X64-SSE2-NEXT: psubw %xmm6, %xmm1
+; X64-SSE2-NEXT: packuswb %xmm5, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm5
+; X64-SSE2-NEXT: psubw %xmm5, %xmm4
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE2-NEXT: psubusw %xmm8, %xmm5
+; X64-SSE2-NEXT: psubw %xmm5, %xmm0
+; X64-SSE2-NEXT: packuswb %xmm4, %xmm0
+; X64-SSE2-NEXT: retq
+;
+; X86-SSE4-LABEL: _mm512_packus_epi16_manual:
+; X86-SSE4: # %bb.0:
+; X86-SSE4-NEXT: pushl %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE4-NEXT: .cfi_offset %ebp, -8
+; X86-SSE4-NEXT: movl %esp, %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE4-NEXT: andl $-16, %esp
+; X86-SSE4-NEXT: subl $16, %esp
+; X86-SSE4-NEXT: pmovzxbw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; X86-SSE4-NEXT: pminuw %xmm4, %xmm2
+; X86-SSE4-NEXT: pminuw %xmm4, %xmm1
+; X86-SSE4-NEXT: pminuw %xmm4, %xmm0
+; X86-SSE4-NEXT: movdqa 72(%ebp), %xmm5
+; X86-SSE4-NEXT: pminuw %xmm4, %xmm5
+; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm3
+; X86-SSE4-NEXT: pminuw %xmm4, %xmm3
+; X86-SSE4-NEXT: packuswb %xmm5, %xmm3
+; X86-SSE4-NEXT: movdqa 56(%ebp), %xmm5
+; X86-SSE4-NEXT: pminuw %xmm4, %xmm5
+; X86-SSE4-NEXT: packuswb %xmm5, %xmm2
+; X86-SSE4-NEXT: movdqa 40(%ebp), %xmm5
+; X86-SSE4-NEXT: pminuw %xmm4, %xmm5
+; X86-SSE4-NEXT: packuswb %xmm5, %xmm1
+; X86-SSE4-NEXT: pminuw 24(%ebp), %xmm4
+; X86-SSE4-NEXT: packuswb %xmm4, %xmm0
+; X86-SSE4-NEXT: movl %ebp, %esp
+; X86-SSE4-NEXT: popl %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE4-NEXT: retl
+;
+; X64-SSE4-LABEL: _mm512_packus_epi16_manual:
+; X64-SSE4: # %bb.0:
+; X64-SSE4-NEXT: pmovzxbw {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm7
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm3
+; X64-SSE4-NEXT: packuswb %xmm7, %xmm3
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm6
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm2
+; X64-SSE4-NEXT: packuswb %xmm6, %xmm2
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm5
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm1
+; X64-SSE4-NEXT: packuswb %xmm5, %xmm1
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm4
+; X64-SSE4-NEXT: pminuw %xmm8, %xmm0
+; X64-SSE4-NEXT: packuswb %xmm4, %xmm0
+; X64-SSE4-NEXT: retq
+;
+; X86-AVX1-LABEL: _mm512_packus_epi16_manual:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: pushl %ebp
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: .cfi_offset %ebp, -8
+; X86-AVX1-NEXT: movl %esp, %ebp
+; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp
+; X86-AVX1-NEXT: andl $-32, %esp
+; X86-AVX1-NEXT: subl $32, %esp
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
+; X86-AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpminuw %xmm3, %xmm2, %xmm5
+; X86-AVX1-NEXT: vpminuw %xmm3, %xmm0, %xmm6
+; X86-AVX1-NEXT: vpackuswb %xmm5, %xmm6, %xmm5
+; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; X86-AVX1-NEXT: vpminuw %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-AVX1-NEXT: vpminuw %xmm3, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminuw 8(%ebp), %xmm3, %xmm2
+; X86-AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpminuw 24(%ebp), %xmm3, %xmm3
+; X86-AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; X86-AVX1-NEXT: movl %ebp, %esp
+; X86-AVX1-NEXT: popl %ebp
+; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4
+; X86-AVX1-NEXT: retl
+;
+; X64-AVX1-LABEL: _mm512_packus_epi16_manual:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm3, %xmm5
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm1, %xmm6
+; X64-AVX1-NEXT: vpackuswb %xmm5, %xmm6, %xmm5
+; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm3, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm2, %xmm3
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm0, %xmm6
+; X64-AVX1-NEXT: vpackuswb %xmm3, %xmm6, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-AVX1-NEXT: vpminuw %xmm4, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; X64-AVX1-NEXT: retq
+;
+; X86-AVX2-LABEL: _mm512_packus_epi16_manual:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: pushl %ebp
+; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT: .cfi_offset %ebp, -8
+; X86-AVX2-NEXT: movl %esp, %ebp
+; X86-AVX2-NEXT: .cfi_def_cfa_register %ebp
+; X86-AVX2-NEXT: andl $-32, %esp
+; X86-AVX2-NEXT: subl $32, %esp
+; X86-AVX2-NEXT: vmovdqa 8(%ebp), %ymm3
+; X86-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
+; X86-AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2
+; X86-AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X86-AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1
+; X86-AVX2-NEXT: vpminuw %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vpackuswb %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT: vpminuw %ymm3, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminuw %ymm3, %ymm4, %ymm2
+; X86-AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; X86-AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; X86-AVX2-NEXT: movl %ebp, %esp
+; X86-AVX2-NEXT: popl %ebp
+; X86-AVX2-NEXT: .cfi_def_cfa %esp, 4
+; X86-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: _mm512_packus_epi16_manual:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
+; X64-AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2
+; X64-AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; X64-AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpminuw %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vpackuswb %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT: vpminuw %ymm3, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminuw %ymm3, %ymm4, %ymm2
+; X64-AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; X64-AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; X64-AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm512_packus_epi16_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+ %sat = tail call <64 x i16> @llvm.umin.v64i16(<64 x i16> %sh, <64 x i16> splat (i16 255))
+ %tr = trunc nuw <64 x i16> %sat to <64 x i8>
+ ret <64 x i8> %tr
+}
+
+define <8 x i16> @_mm_packus_epi32_manual(<4 x i32> %a, <4 x i32> %b) {
+; X86-SSE2-LABEL: _mm_packus_epi32_manual:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm4
+; X86-SSE2-NEXT: pxor %xmm3, %xmm4
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm5
+; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm4, %xmm5
+; X86-SSE2-NEXT: por %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm1, %xmm3
+; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2
+; X86-SSE2-NEXT: pxor %xmm2, %xmm4
+; X86-SSE2-NEXT: pand %xmm1, %xmm2
+; X86-SSE2-NEXT: por %xmm4, %xmm2
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: psrad $16, %xmm2
+; X86-SSE2-NEXT: pslld $16, %xmm0
+; X86-SSE2-NEXT: psrad $16, %xmm0
+; X86-SSE2-NEXT: packssdw %xmm2, %xmm0
+; X86-SSE2-NEXT: retl
+;
+; X64-SSE2-LABEL: _mm_packus_epi32_manual:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm2, %xmm3
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
+; X64-SSE2-NEXT: pcmpeqd %xmm3, %xmm3
+; X64-SSE2-NEXT: pand %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm3, %xmm5
+; X64-SSE2-NEXT: por %xmm5, %xmm0
+; X64-SSE2-NEXT: pxor %xmm1, %xmm2
+; X64-SSE2-NEXT: pcmpgtd %xmm2, %xmm4
+; X64-SSE2-NEXT: pxor %xmm4, %xmm3
+; X64-SSE2-NEXT: pand %xmm1, %xmm4
+; X64-SSE2-NEXT: por %xmm3, %xmm4
+; X64-SSE2-NEXT: pslld $16, %xmm4
+; X64-SSE2-NEXT: psrad $16, %xmm4
+; X64-SSE2-NEXT: pslld $16, %xmm0
+; X64-SSE2-NEXT: psrad $16, %xmm0
+; X64-SSE2-NEXT: packssdw %xmm4, %xmm0
+; X64-SSE2-NEXT: retq
+;
+; SSE4-LABEL: _mm_packus_epi32_manual:
+; SSE4: # %bb.0:
+; SSE4-NEXT: packusdw %xmm1, %xmm0
+; SSE4-NEXT: ret{{[l|q]}}
+;
+; AVX-LABEL: _mm_packus_epi32_manual:
+; AVX: # %bb.0:
+; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: _mm_packus_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %sat = tail call <8 x i32> @llvm.umin.v8i32(<8 x i32> %sh, <8 x i32> splat (i32 65535))
+ %tr = trunc nuw <8 x i32> %sat to <8 x i16>
+ ret <8 x i16> %tr
+}
+
+define <16 x i16> @_mm256_packus_epi32_manual(<8 x i32> %a, <8 x i32> %b) {
+; X86-SSE2-LABEL: _mm256_packus_epi32_manual:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE2-NEXT: .cfi_offset %ebp, -8
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $32, %esp
+; X86-SSE2-NEXT: movaps %xmm2, (%esp) # 16-byte Spill
+; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm2
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X86-SSE2-NEXT: pxor %xmm5, %xmm6
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; X86-SSE2-NEXT: pcmpeqd %xmm6, %xmm6
+; X86-SSE2-NEXT: pand %xmm4, %xmm1
+; X86-SSE2-NEXT: pxor %xmm6, %xmm4
+; X86-SSE2-NEXT: por %xmm4, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm7
+; X86-SSE2-NEXT: pxor %xmm5, %xmm7
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm7, %xmm4
+; X86-SSE2-NEXT: pand %xmm4, %xmm2
+; X86-SSE2-NEXT: pxor %xmm6, %xmm4
+; X86-SSE2-NEXT: por %xmm2, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
+; X86-SSE2-NEXT: pxor %xmm5, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm7
+; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm7
+; X86-SSE2-NEXT: pand %xmm7, %xmm0
+; X86-SSE2-NEXT: pxor %xmm6, %xmm7
+; X86-SSE2-NEXT: por %xmm7, %xmm0
+; X86-SSE2-NEXT: movdqa (%esp), %xmm2 # 16-byte Reload
+; X86-SSE2-NEXT: pxor %xmm2, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm3
+; X86-SSE2-NEXT: pxor %xmm3, %xmm6
+; X86-SSE2-NEXT: pand %xmm2, %xmm3
+; X86-SSE2-NEXT: por %xmm6, %xmm3
+; X86-SSE2-NEXT: pslld $16, %xmm3
+; X86-SSE2-NEXT: psrad $16, %xmm3
+; X86-SSE2-NEXT: pslld $16, %xmm0
+; X86-SSE2-NEXT: psrad $16, %xmm0
+; X86-SSE2-NEXT: packssdw %xmm3, %xmm0
+; X86-SSE2-NEXT: pslld $16, %xmm4
+; X86-SSE2-NEXT: psrad $16, %xmm4
+; X86-SSE2-NEXT: pslld $16, %xmm1
+; X86-SSE2-NEXT: psrad $16, %xmm1
+; X86-SSE2-NEXT: packssdw %xmm4, %xmm1
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE2-NEXT: retl
+;
+; X64-SSE2-LABEL: _mm256_packus_epi32_manual:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm5
+; X64-SSE2-NEXT: pxor %xmm6, %xmm5
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm8
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm8
+; X64-SSE2-NEXT: pcmpeqd %xmm7, %xmm7
+; X64-SSE2-NEXT: pand %xmm8, %xmm1
+; X64-SSE2-NEXT: pxor %xmm7, %xmm8
+; X64-SSE2-NEXT: por %xmm8, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm8
+; X64-SSE2-NEXT: pxor %xmm6, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm5
+; X64-SSE2-NEXT: pcmpgtd %xmm8, %xmm5
+; X64-SSE2-NEXT: pand %xmm5, %xmm3
+; X64-SSE2-NEXT: pxor %xmm7, %xmm5
+; X64-SSE2-NEXT: por %xmm3, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm3
+; X64-SSE2-NEXT: pxor %xmm6, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm4, %xmm8
+; X64-SSE2-NEXT: pcmpgtd %xmm3, %xmm8
+; X64-SSE2-NEXT: pand %xmm8, %xmm0
+; X64-SSE2-NEXT: pxor %xmm7, %xmm8
+; X64-SSE2-NEXT: por %xmm8, %xmm0
+; X64-SSE2-NEXT: pxor %xmm2, %xmm6
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm4
+; X64-SSE2-NEXT: pxor %xmm4, %xmm7
+; X64-SSE2-NEXT: pand %xmm2, %xmm4
+; X64-SSE2-NEXT: por %xmm7, %xmm4
+; X64-SSE2-NEXT: pslld $16, %xmm4
+; X64-SSE2-NEXT: psrad $16, %xmm4
+; X64-SSE2-NEXT: pslld $16, %xmm0
+; X64-SSE2-NEXT: psrad $16, %xmm0
+; X64-SSE2-NEXT: packssdw %xmm4, %xmm0
+; X64-SSE2-NEXT: pslld $16, %xmm5
+; X64-SSE2-NEXT: psrad $16, %xmm5
+; X64-SSE2-NEXT: pslld $16, %xmm1
+; X64-SSE2-NEXT: psrad $16, %xmm1
+; X64-SSE2-NEXT: packssdw %xmm5, %xmm1
+; X64-SSE2-NEXT: retq
+;
+; X86-SSE4-LABEL: _mm256_packus_epi32_manual:
+; X86-SSE4: # %bb.0:
+; X86-SSE4-NEXT: pushl %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE4-NEXT: .cfi_offset %ebp, -8
+; X86-SSE4-NEXT: movl %esp, %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE4-NEXT: andl $-16, %esp
+; X86-SSE4-NEXT: subl $16, %esp
+; X86-SSE4-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
+; X86-SSE4-NEXT: pminud %xmm3, %xmm1
+; X86-SSE4-NEXT: pminud %xmm3, %xmm2
+; X86-SSE4-NEXT: pminud %xmm3, %xmm0
+; X86-SSE4-NEXT: packusdw %xmm2, %xmm0
+; X86-SSE4-NEXT: pminud 8(%ebp), %xmm3
+; X86-SSE4-NEXT: packusdw %xmm3, %xmm1
+; X86-SSE4-NEXT: movl %ebp, %esp
+; X86-SSE4-NEXT: popl %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE4-NEXT: retl
+;
+; X64-SSE4-LABEL: _mm256_packus_epi32_manual:
+; X64-SSE4: # %bb.0:
+; X64-SSE4-NEXT: pmovsxbw {{.*#+}} xmm4 = [65535,0,65535,0,65535,0,65535,0]
+; X64-SSE4-NEXT: pminud %xmm4, %xmm3
+; X64-SSE4-NEXT: pminud %xmm4, %xmm1
+; X64-SSE4-NEXT: packusdw %xmm3, %xmm1
+; X64-SSE4-NEXT: pminud %xmm4, %xmm2
+; X64-SSE4-NEXT: pminud %xmm4, %xmm0
+; X64-SSE4-NEXT: packusdw %xmm2, %xmm0
+; X64-SSE4-NEXT: retq
+;
+; AVX1-LABEL: _mm256_packus_epi32_manual:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm4
+; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX1-NEXT: ret{{[l|q]}}
+;
+; AVX2-LABEL: _mm256_packus_epi32_manual:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: ret{{[l|q]}}
+;
+; AVX512-LABEL: _mm256_packus_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
+ %sat = tail call <16 x i32> @llvm.umin.v16i32(<16 x i32> %sh, <16 x i32> splat (i32 65535))
+ %tr = trunc nuw <16 x i32> %sat to <16 x i16>
+ ret <16 x i16> %tr
+}
+
+define <32 x i16> @_mm512_packus_epi32_manual(<16 x i32> %a, <16 x i32> %b) {
+; X86-SSE2-LABEL: _mm512_packus_epi32_manual:
+; X86-SSE2: # %bb.0:
+; X86-SSE2-NEXT: pushl %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE2-NEXT: .cfi_offset %ebp, -8
+; X86-SSE2-NEXT: movl %esp, %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE2-NEXT: andl $-16, %esp
+; X86-SSE2-NEXT: subl $80, %esp
+; X86-SSE2-NEXT: movaps %xmm1, (%esp) # 16-byte Spill
+; X86-SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm0
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pxor %xmm6, %xmm1
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm4
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4
+; X86-SSE2-NEXT: pcmpeqd %xmm7, %xmm7
+; X86-SSE2-NEXT: pand %xmm4, %xmm0
+; X86-SSE2-NEXT: pxor %xmm7, %xmm4
+; X86-SSE2-NEXT: por %xmm0, %xmm4
+; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X86-SSE2-NEXT: pxor %xmm6, %xmm5
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm1
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: pxor %xmm7, %xmm1
+; X86-SSE2-NEXT: por %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-SSE2-NEXT: movdqa %xmm2, %xmm0
+; X86-SSE2-NEXT: pxor %xmm6, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm2
+; X86-SSE2-NEXT: pxor %xmm7, %xmm5
+; X86-SSE2-NEXT: por %xmm5, %xmm2
+; X86-SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm5
+; X86-SSE2-NEXT: movdqa %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm6, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; X86-SSE2-NEXT: pand %xmm2, %xmm5
+; X86-SSE2-NEXT: pxor %xmm7, %xmm2
+; X86-SSE2-NEXT: por %xmm5, %xmm2
+; X86-SSE2-NEXT: movdqa (%esp), %xmm1 # 16-byte Reload
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: pxor %xmm6, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm1
+; X86-SSE2-NEXT: pxor %xmm7, %xmm5
+; X86-SSE2-NEXT: por %xmm5, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm1, (%esp) # 16-byte Spill
+; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: pxor %xmm6, %xmm1
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm5
+; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm5
+; X86-SSE2-NEXT: pand %xmm5, %xmm0
+; X86-SSE2-NEXT: pxor %xmm7, %xmm5
+; X86-SSE2-NEXT: por %xmm0, %xmm5
+; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-SSE2-NEXT: pxor %xmm6, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm1
+; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-SSE2-NEXT: pand %xmm1, %xmm0
+; X86-SSE2-NEXT: pxor %xmm7, %xmm1
+; X86-SSE2-NEXT: por %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa %xmm0, %xmm1
+; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm0
+; X86-SSE2-NEXT: pxor %xmm0, %xmm6
+; X86-SSE2-NEXT: pcmpgtd %xmm6, %xmm3
+; X86-SSE2-NEXT: pxor %xmm3, %xmm7
+; X86-SSE2-NEXT: pand %xmm0, %xmm3
+; X86-SSE2-NEXT: por %xmm7, %xmm3
+; X86-SSE2-NEXT: pslld $16, %xmm3
+; X86-SSE2-NEXT: psrad $16, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
+; X86-SSE2-NEXT: pslld $16, %xmm0
+; X86-SSE2-NEXT: psrad $16, %xmm0
+; X86-SSE2-NEXT: packssdw %xmm3, %xmm0
+; X86-SSE2-NEXT: pslld $16, %xmm5
+; X86-SSE2-NEXT: psrad $16, %xmm5
+; X86-SSE2-NEXT: movdqa (%esp), %xmm1 # 16-byte Reload
+; X86-SSE2-NEXT: pslld $16, %xmm1
+; X86-SSE2-NEXT: psrad $16, %xmm1
+; X86-SSE2-NEXT: packssdw %xmm5, %xmm1
+; X86-SSE2-NEXT: pslld $16, %xmm2
+; X86-SSE2-NEXT: psrad $16, %xmm2
+; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload
+; X86-SSE2-NEXT: pslld $16, %xmm3
+; X86-SSE2-NEXT: psrad $16, %xmm3
+; X86-SSE2-NEXT: packssdw %xmm2, %xmm3
+; X86-SSE2-NEXT: movdqa %xmm3, %xmm2
+; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload
+; X86-SSE2-NEXT: pslld $16, %xmm3
+; X86-SSE2-NEXT: psrad $16, %xmm3
+; X86-SSE2-NEXT: pslld $16, %xmm4
+; X86-SSE2-NEXT: psrad $16, %xmm4
+; X86-SSE2-NEXT: packssdw %xmm3, %xmm4
+; X86-SSE2-NEXT: movdqa %xmm4, %xmm3
+; X86-SSE2-NEXT: movl %ebp, %esp
+; X86-SSE2-NEXT: popl %ebp
+; X86-SSE2-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE2-NEXT: retl
+;
+; X64-SSE2-LABEL: _mm512_packus_epi32_manual:
+; X64-SSE2: # %bb.0:
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648,2147483648,2147483648]
+; X64-SSE2-NEXT: movdqa %xmm3, %xmm8
+; X64-SSE2-NEXT: pxor %xmm10, %xmm8
+; X64-SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183,2147549183,2147549183]
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm12
+; X64-SSE2-NEXT: pcmpgtd %xmm8, %xmm12
+; X64-SSE2-NEXT: pcmpeqd %xmm11, %xmm11
+; X64-SSE2-NEXT: pand %xmm12, %xmm3
+; X64-SSE2-NEXT: pxor %xmm11, %xmm12
+; X64-SSE2-NEXT: por %xmm12, %xmm3
+; X64-SSE2-NEXT: movdqa %xmm7, %xmm12
+; X64-SSE2-NEXT: pxor %xmm10, %xmm12
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm8
+; X64-SSE2-NEXT: pcmpgtd %xmm12, %xmm8
+; X64-SSE2-NEXT: pand %xmm8, %xmm7
+; X64-SSE2-NEXT: pxor %xmm11, %xmm8
+; X64-SSE2-NEXT: por %xmm7, %xmm8
+; X64-SSE2-NEXT: movdqa %xmm2, %xmm7
+; X64-SSE2-NEXT: pxor %xmm10, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm12
+; X64-SSE2-NEXT: pcmpgtd %xmm7, %xmm12
+; X64-SSE2-NEXT: pand %xmm12, %xmm2
+; X64-SSE2-NEXT: pxor %xmm11, %xmm12
+; X64-SSE2-NEXT: por %xmm12, %xmm2
+; X64-SSE2-NEXT: movdqa %xmm6, %xmm12
+; X64-SSE2-NEXT: pxor %xmm10, %xmm12
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm7
+; X64-SSE2-NEXT: pcmpgtd %xmm12, %xmm7
+; X64-SSE2-NEXT: pand %xmm7, %xmm6
+; X64-SSE2-NEXT: pxor %xmm11, %xmm7
+; X64-SSE2-NEXT: por %xmm6, %xmm7
+; X64-SSE2-NEXT: movdqa %xmm1, %xmm6
+; X64-SSE2-NEXT: pxor %xmm10, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm12
+; X64-SSE2-NEXT: pcmpgtd %xmm6, %xmm12
+; X64-SSE2-NEXT: pand %xmm12, %xmm1
+; X64-SSE2-NEXT: pxor %xmm11, %xmm12
+; X64-SSE2-NEXT: por %xmm12, %xmm1
+; X64-SSE2-NEXT: movdqa %xmm5, %xmm12
+; X64-SSE2-NEXT: pxor %xmm10, %xmm12
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm6
+; X64-SSE2-NEXT: pcmpgtd %xmm12, %xmm6
+; X64-SSE2-NEXT: pand %xmm6, %xmm5
+; X64-SSE2-NEXT: pxor %xmm11, %xmm6
+; X64-SSE2-NEXT: por %xmm5, %xmm6
+; X64-SSE2-NEXT: movdqa %xmm0, %xmm5
+; X64-SSE2-NEXT: pxor %xmm10, %xmm5
+; X64-SSE2-NEXT: movdqa %xmm9, %xmm12
+; X64-SSE2-NEXT: pcmpgtd %xmm5, %xmm12
+; X64-SSE2-NEXT: pand %xmm12, %xmm0
+; X64-SSE2-NEXT: pxor %xmm11, %xmm12
+; X64-SSE2-NEXT: por %xmm12, %xmm0
+; X64-SSE2-NEXT: pxor %xmm4, %xmm10
+; X64-SSE2-NEXT: pcmpgtd %xmm10, %xmm9
+; X64-SSE2-NEXT: pxor %xmm9, %xmm11
+; X64-SSE2-NEXT: pand %xmm4, %xmm9
+; X64-SSE2-NEXT: por %xmm11, %xmm9
+; X64-SSE2-NEXT: pslld $16, %xmm9
+; X64-SSE2-NEXT: psrad $16, %xmm9
+; X64-SSE2-NEXT: pslld $16, %xmm0
+; X64-SSE2-NEXT: psrad $16, %xmm0
+; X64-SSE2-NEXT: packssdw %xmm9, %xmm0
+; X64-SSE2-NEXT: pslld $16, %xmm6
+; X64-SSE2-NEXT: psrad $16, %xmm6
+; X64-SSE2-NEXT: pslld $16, %xmm1
+; X64-SSE2-NEXT: psrad $16, %xmm1
+; X64-SSE2-NEXT: packssdw %xmm6, %xmm1
+; X64-SSE2-NEXT: pslld $16, %xmm7
+; X64-SSE2-NEXT: psrad $16, %xmm7
+; X64-SSE2-NEXT: pslld $16, %xmm2
+; X64-SSE2-NEXT: psrad $16, %xmm2
+; X64-SSE2-NEXT: packssdw %xmm7, %xmm2
+; X64-SSE2-NEXT: pslld $16, %xmm8
+; X64-SSE2-NEXT: psrad $16, %xmm8
+; X64-SSE2-NEXT: pslld $16, %xmm3
+; X64-SSE2-NEXT: psrad $16, %xmm3
+; X64-SSE2-NEXT: packssdw %xmm8, %xmm3
+; X64-SSE2-NEXT: retq
+;
+; X86-SSE4-LABEL: _mm512_packus_epi32_manual:
+; X86-SSE4: # %bb.0:
+; X86-SSE4-NEXT: pushl %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa_offset 8
+; X86-SSE4-NEXT: .cfi_offset %ebp, -8
+; X86-SSE4-NEXT: movl %esp, %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa_register %ebp
+; X86-SSE4-NEXT: andl $-16, %esp
+; X86-SSE4-NEXT: subl $16, %esp
+; X86-SSE4-NEXT: pmovsxbw {{.*#+}} xmm4 = [65535,0,65535,0,65535,0,65535,0]
+; X86-SSE4-NEXT: pminud %xmm4, %xmm2
+; X86-SSE4-NEXT: pminud %xmm4, %xmm1
+; X86-SSE4-NEXT: pminud %xmm4, %xmm0
+; X86-SSE4-NEXT: movdqa 72(%ebp), %xmm5
+; X86-SSE4-NEXT: pminud %xmm4, %xmm5
+; X86-SSE4-NEXT: movdqa 8(%ebp), %xmm3
+; X86-SSE4-NEXT: pminud %xmm4, %xmm3
+; X86-SSE4-NEXT: packusdw %xmm5, %xmm3
+; X86-SSE4-NEXT: movdqa 56(%ebp), %xmm5
+; X86-SSE4-NEXT: pminud %xmm4, %xmm5
+; X86-SSE4-NEXT: packusdw %xmm5, %xmm2
+; X86-SSE4-NEXT: movdqa 40(%ebp), %xmm5
+; X86-SSE4-NEXT: pminud %xmm4, %xmm5
+; X86-SSE4-NEXT: packusdw %xmm5, %xmm1
+; X86-SSE4-NEXT: pminud 24(%ebp), %xmm4
+; X86-SSE4-NEXT: packusdw %xmm4, %xmm0
+; X86-SSE4-NEXT: movl %ebp, %esp
+; X86-SSE4-NEXT: popl %ebp
+; X86-SSE4-NEXT: .cfi_def_cfa %esp, 4
+; X86-SSE4-NEXT: retl
+;
+; X64-SSE4-LABEL: _mm512_packus_epi32_manual:
+; X64-SSE4: # %bb.0:
+; X64-SSE4-NEXT: pmovsxbw {{.*#+}} xmm8 = [65535,0,65535,0,65535,0,65535,0]
+; X64-SSE4-NEXT: pminud %xmm8, %xmm7
+; X64-SSE4-NEXT: pminud %xmm8, %xmm3
+; X64-SSE4-NEXT: packusdw %xmm7, %xmm3
+; X64-SSE4-NEXT: pminud %xmm8, %xmm6
+; X64-SSE4-NEXT: pminud %xmm8, %xmm2
+; X64-SSE4-NEXT: packusdw %xmm6, %xmm2
+; X64-SSE4-NEXT: pminud %xmm8, %xmm5
+; X64-SSE4-NEXT: pminud %xmm8, %xmm1
+; X64-SSE4-NEXT: packusdw %xmm5, %xmm1
+; X64-SSE4-NEXT: pminud %xmm8, %xmm4
+; X64-SSE4-NEXT: pminud %xmm8, %xmm0
+; X64-SSE4-NEXT: packusdw %xmm4, %xmm0
+; X64-SSE4-NEXT: retq
+;
+; X86-AVX1-LABEL: _mm512_packus_epi32_manual:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: pushl %ebp
+; X86-AVX1-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX1-NEXT: .cfi_offset %ebp, -8
+; X86-AVX1-NEXT: movl %esp, %ebp
+; X86-AVX1-NEXT: .cfi_def_cfa_register %ebp
+; X86-AVX1-NEXT: andl $-32, %esp
+; X86-AVX1-NEXT: subl $32, %esp
+; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [65535,65535,65535,65535]
+; X86-AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm4
+; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; X86-AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
+; X86-AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm5
+; X86-AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm6
+; X86-AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
+; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; X86-AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
+; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; X86-AVX1-NEXT: vpminud 8(%ebp), %xmm3, %xmm2
+; X86-AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
+; X86-AVX1-NEXT: vpminud 24(%ebp), %xmm3, %xmm3
+; X86-AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
+; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; X86-AVX1-NEXT: movl %ebp, %esp
+; X86-AVX1-NEXT: popl %ebp
+; X86-AVX1-NEXT: .cfi_def_cfa %esp, 4
+; X86-AVX1-NEXT: retl
+;
+; X64-AVX1-LABEL: _mm512_packus_epi32_manual:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm5
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm6
+; X64-AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
+; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm3, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm3
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm0, %xmm6
+; X64-AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
+; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2
+; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-AVX1-NEXT: vpminud %xmm4, %xmm0, %xmm0
+; X64-AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; X64-AVX1-NEXT: retq
+;
+; X86-AVX2-LABEL: _mm512_packus_epi32_manual:
+; X86-AVX2: # %bb.0:
+; X86-AVX2-NEXT: pushl %ebp
+; X86-AVX2-NEXT: .cfi_def_cfa_offset 8
+; X86-AVX2-NEXT: .cfi_offset %ebp, -8
+; X86-AVX2-NEXT: movl %esp, %ebp
+; X86-AVX2-NEXT: .cfi_def_cfa_register %ebp
+; X86-AVX2-NEXT: andl $-32, %esp
+; X86-AVX2-NEXT: subl $32, %esp
+; X86-AVX2-NEXT: vmovdqa 8(%ebp), %ymm3
+; X86-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
+; X86-AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2
+; X86-AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; X86-AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
+; X86-AVX2-NEXT: vpminud %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1
+; X86-AVX2-NEXT: vpminud %ymm3, %ymm0, %ymm0
+; X86-AVX2-NEXT: vpminud %ymm3, %ymm4, %ymm2
+; X86-AVX2-NEXT: vpackusdw %ymm0, %ymm2, %ymm0
+; X86-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; X86-AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; X86-AVX2-NEXT: movl %ebp, %esp
+; X86-AVX2-NEXT: popl %ebp
+; X86-AVX2-NEXT: .cfi_def_cfa %esp, 4
+; X86-AVX2-NEXT: retl
+;
+; X64-AVX2-LABEL: _mm512_packus_epi32_manual:
+; X64-AVX2: # %bb.0:
+; X64-AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
+; X64-AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
+; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm2
+; X64-AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535]
+; X64-AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpminud %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1
+; X64-AVX2-NEXT: vpminud %ymm3, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpminud %ymm3, %ymm4, %ymm2
+; X64-AVX2-NEXT: vpackusdw %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; X64-AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; X64-AVX2-NEXT: retq
+;
+; AVX512-LABEL: _mm512_packus_epi32_manual:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: ret{{[l|q]}}
+ %sh = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
+ %sat = tail call <32 x i32> @llvm.umin.v32i32(<32 x i32> %sh, <32 x i32> splat (i32 65535))
+ %tr = trunc nuw <32 x i32> %sat to <32 x i16>
+ ret <32 x i16> %tr
+}
+
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index e10b360b35b56..a26112397053d 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -793,9 +793,6 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
 ;
 ; SSE41-LABEL: test13:
 ; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm1
 ; SSE41-NEXT: packusdw %xmm2, %xmm1
 ; SSE41-NEXT: psubusw %xmm1, %xmm0
 ; SSE41-NEXT: retq
@@ -1047,9 +1044,6 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
 ;
 ; SSE41-LABEL: test15:
 ; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm1
 ; SSE41-NEXT: packusdw %xmm2, %xmm1
 ; SSE41-NEXT: psubusw %xmm1, %xmm0
 ; SSE41-NEXT: retq
@@ -1565,9 +1559,6 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
 ;
 ; SSE41-LABEL: psubus_8i32_max:
 ; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm1
 ; SSE41-NEXT: packusdw %xmm2, %xmm1
 ; SSE41-NEXT: psubusw %xmm1, %xmm0
 ; SSE41-NEXT: retq
@@ -1972,9 +1963,6 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin
 ;
 ; SSE41-LABEL: psubus_i16_i32_max_swapped:
 ; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm1
 ; SSE41-NEXT: packusdw %xmm2, %xmm1
 ; SSE41-NEXT: psubusw %xmm1, %xmm0
 ; SSE41-NEXT: retq
@@ -2067,9 +2055,6 @@ define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind {
 ;
 ; SSE41-LABEL: psubus_i16_i32_min:
 ; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm1
 ; SSE41-NEXT: packusdw %xmm2, %xmm1
 ; SSE41-NEXT: psubusw %xmm1, %xmm0
 ; SSE41-NEXT: retq
@@ -2656,9 +2641,6 @@ define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) {
 ;
 ; SSE41-LABEL: test32:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm3 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm1
 ; SSE41-NEXT: packusdw %xmm2, %xmm1
 ; SSE41-NEXT: psubusw %xmm1, %xmm0
 ; SSE41-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index 0806e4960e48a..930758d734d91 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -1383,9 +1383,6 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) {
 ;
 ; SSE41-LABEL: trunc_usat_v8i32_v8i16:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbw {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
-; SSE41-NEXT: pminud %xmm2, %xmm1
-; SSE41-NEXT: pminud %xmm2, %xmm0
 ; SSE41-NEXT: packusdw %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
@@ -3424,9 +3421,6 @@ define <16 x i8> @trunc_usat_v16i16_v16i8(<16 x i16> %a0) {
 ;
 ; SSE41-LABEL: trunc_usat_v16i16_v16i8:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pminuw %xmm2, %xmm1
-; SSE41-NEXT: pminuw %xmm2, %xmm0
 ; SSE41-NEXT: packuswb %xmm1, %xmm0
 ; SSE41-NEXT: retq
 ;
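All of the *_packus_*_manual tests above encode one idiom: emulate a pack intrinsic by interleaving the two sources 128-bit lane by 128-bit lane, clamping with an unsigned min, and truncating. For reference, here is a minimal C sketch of that idiom (ours, for illustration only; it is not part of the patch). It should lower to essentially the shufflevector + llvm.umin + trunc IR tested above; it assumes clang's GNU vector extensions together with the __builtin_shufflevector, __builtin_elementwise_min, and __builtin_convertvector builtins, and the type and function names are invented for the example:

#include <stdint.h>

typedef uint16_t u16x16 __attribute__((vector_size(32)));
typedef uint16_t u16x32 __attribute__((vector_size(64)));
typedef uint8_t  u8x32  __attribute__((vector_size(32)));

/* Emulates _mm256_packus_epi16(a, b) without the intrinsic: interleave the
 * inputs lane by lane (a[0..7], b[0..7], a[8..15], b[8..15]), clamp every
 * u16 to 255, then truncate each element to u8. */
u8x32 packus_epi16_manual(u16x16 a, u16x16 b) {
  u16x32 sh = __builtin_shufflevector(
      a, b,
      0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23,
      8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31);
  /* Unsigned saturation; becomes llvm.umin in IR. */
  u16x32 sat = __builtin_elementwise_min(sh, (u16x32){0} + 255);
  /* Element-wise truncation to i8. */
  return __builtin_convertvector(sat, u8x32);
}

With the combine in place, the AVX2/AVX512 CHECK lines above show this whole function collapsing to a single vpackuswb; the psubus.ll and vector-trunc-usat.ll hunks show the same effect, with the pre-clamping pminud/pminuw constant loads and min instructions no longer emitted before the pack.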