diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 0292f1428a092..4e907fd19e712 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -506,6 +506,8 @@ SDValue VectorLegalizer::Promote(SDValue Op) { return PromoteINT_TO_FP(Op); case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: + case ISD::STRICT_FP_TO_SINT: // Promote the operation by extending the operand. return PromoteFP_TO_INT(Op); } @@ -575,6 +577,7 @@ SDValue VectorLegalizer::PromoteINT_TO_FP(SDValue Op) { SDValue VectorLegalizer::PromoteFP_TO_INT(SDValue Op) { MVT VT = Op.getSimpleValueType(); MVT NVT = TLI.getTypeToPromoteTo(Op.getOpcode(), VT); + bool IsStrict = Op->isStrictFPOpcode(); assert(NVT.getVectorNumElements() == VT.getVectorNumElements() && "Vectors have different number of elements!"); @@ -585,17 +588,35 @@ SDValue VectorLegalizer::PromoteFP_TO_INT(SDValue Op) { TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT)) NewOpc = ISD::FP_TO_SINT; + if (NewOpc == ISD::STRICT_FP_TO_UINT && + TLI.isOperationLegalOrCustom(ISD::STRICT_FP_TO_SINT, NVT)) + NewOpc = ISD::STRICT_FP_TO_SINT; + SDLoc dl(Op); - SDValue Promoted = DAG.getNode(NewOpc, dl, NVT, Op.getOperand(0)); + SDValue Promoted, Chain; + if (IsStrict) { + Promoted = DAG.getNode(NewOpc, dl, {NVT, MVT::Other}, + {Op.getOperand(0), Op.getOperand(1)}); + Chain = Promoted.getValue(1); + } else + Promoted = DAG.getNode(NewOpc, dl, NVT, Op.getOperand(0)); // Assert that the converted value fits in the original type. If it doesn't // (eg: because the value being converted is too big), then the result of the // original operation was undefined anyway, so the assert is still correct. - Promoted = DAG.getNode(Op->getOpcode() == ISD::FP_TO_UINT ? 
ISD::AssertZext - : ISD::AssertSext, - dl, NVT, Promoted, + if (Op->getOpcode() == ISD::FP_TO_UINT || + Op->getOpcode() == ISD::STRICT_FP_TO_UINT) + NewOpc = ISD::AssertZext; + else + NewOpc = ISD::AssertSext; + + Promoted = DAG.getNode(NewOpc, dl, NVT, Promoted, DAG.getValueType(VT.getScalarType())); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Promoted); + Promoted = DAG.getNode(ISD::TRUNCATE, dl, VT, Promoted); + if (IsStrict) + return DAG.getMergeValues({Promoted, Chain}, dl); + + return Promoted; } SDValue VectorLegalizer::ExpandLoad(SDValue Op) { diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 592e64842070e..7df6ecdc5ef3c 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -816,7 +816,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() { switch (N->getOpcode()) { case ISD::FP_TO_SINT: - case ISD::FP_TO_UINT: { + case ISD::FP_TO_UINT: + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: { // Replace vector fp_to_s/uint with their X86 specific equivalent so we // don't need 2 sets of patterns. 
if (!N->getSimpleValueType(0).isVector()) @@ -825,13 +827,27 @@ void X86DAGToDAGISel::PreprocessISelDAG() { unsigned NewOpc; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); + case ISD::STRICT_FP_TO_SINT: case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; + case ISD::STRICT_FP_TO_UINT: case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; } - SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), - N->getOperand(0)); + SDValue Res; + if (N->isStrictFPOpcode()) + Res = + CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other}, + {N->getOperand(0), N->getOperand(1)}); + else + Res = + CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other}, + {CurDAG->getEntryNode(), N->getOperand(0)}); --I; - CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); + if (N->isStrictFPOpcode()) { + SDValue From[] = {SDValue(N, 0), SDValue(N, 1)}; + SDValue To[] = {Res.getValue(0), Res.getValue(1)}; + CurDAG->ReplaceAllUsesOfValuesWith(From, To, 2); + } else + CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; CurDAG->DeleteNode(N); continue; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 162f6292ea3a0..63268bc3c0165 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -979,18 +979,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom); // Custom legalize these to avoid over promotion or custom promotion. 
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); + for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) { + setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::FP_TO_UINT, VT, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); + } setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal); @@ -1164,9 +1162,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. 
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); - setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); @@ -1361,12 +1362,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); - setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); + setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32); + setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); + 
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); // There is no byte sized k-register load or store without AVX512DQ. if (!Subtarget.hasDQI()) { @@ -1440,16 +1447,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FCOPYSIGN, VT, Custom); } - setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32); - setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32); - setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32); - setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32); - setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); + for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) { + setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32); + setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32); + setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32); + } + setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal); + setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal); setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal); @@ -1551,6 +1560,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); 
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal); + setOperationAction(ISD::MUL, MVT::v8i64, Legal); } @@ -1641,12 +1653,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, if (Subtarget.hasDQI()) { for (auto VT : { MVT::v2i64, MVT::v4i64 }) { - setOperationAction(ISD::SINT_TO_FP, VT, Legal); - setOperationAction(ISD::UINT_TO_FP, VT, Legal); - setOperationAction(ISD::FP_TO_SINT, VT, Legal); - setOperationAction(ISD::FP_TO_UINT, VT, Legal); - - setOperationAction(ISD::MUL, VT, Legal); + setOperationAction(ISD::SINT_TO_FP, VT, Legal); + setOperationAction(ISD::UINT_TO_FP, VT, Legal); + setOperationAction(ISD::FP_TO_SINT, VT, Legal); + setOperationAction(ISD::FP_TO_UINT, VT, Legal); + setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Legal); + setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Legal); + setOperationAction(ISD::MUL, VT, Legal); } } @@ -1821,8 +1834,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && "Unexpected operation action!"); // v2i64 FP_TO_S/UINT(v2f32) custom conversion. - setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom); + setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom); } if (Subtarget.hasBWI()) { @@ -19739,31 +19754,57 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { MVT ResVT = MVT::v4i32; MVT TruncVT = MVT::v4i1; - unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; if (!IsSigned && !Subtarget.hasVLX()) { // Widen to 512-bits. 
ResVT = MVT::v8i32; TruncVT = MVT::v8i1; - Opc = ISD::FP_TO_UINT; + unsigned Opc = IsStrict ? ISD::STRICT_FP_TO_UINT : ISD::FP_TO_UINT; Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, DAG.getUNDEF(MVT::v8f64), Src, DAG.getIntPtrConstant(0, dl)); + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, + {Op.getOperand(0), Src}); + Chain = Res.getValue(1); + } else + Res = DAG.getNode(Opc, dl, ResVT, Src); + Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res, + DAG.getIntPtrConstant(0, dl)); + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + return Res; } - // FIXME: Strict fp! - assert(!IsStrict && "Unhandled strict operation!"); - SDValue Res = DAG.getNode(Opc, dl, ResVT, Src); + SDValue Res, Chain; + unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + if (IsStrict) { + Res = + DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src}); + Chain = Res.getValue(1); + } else + Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, + {DAG.getEntryNode(), Src}); Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res, - DAG.getIntPtrConstant(0, dl)); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res, + DAG.getIntPtrConstant(0, dl)); + if (IsStrict) + return DAG.getMergeValues({Res, Chain}, dl); + return Res; } assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!"); if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) { - // FIXME: Strict fp! - assert(!IsStrict && "Unhandled strict operation!"); - return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT, - DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, - DAG.getUNDEF(MVT::v2f32))); + SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, + DAG.getUNDEF(MVT::v2f32)); + unsigned Opc = IsSigned ? 
X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + SDValue Res, Chain; + if (IsStrict) { + Res = DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp}); + Chain = Res.getValue(1); + return DAG.getMergeValues({Res, Chain}, dl); + } + return DAG.getNode(Opc, dl, {VT, MVT::Other}, {DAG.getEntryNode(), Tmp}); } return SDValue(); @@ -23100,6 +23141,26 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); } +// We share some nodes between STRICT and non STRICT FP intrinsics. +// For these nodes, we need chain them to entry token if they are not called +// by STRICT FP intrinsics. +static SDValue getProperNode(unsigned Opcode, const SDLoc &dl, EVT VT, + ArrayRef<SDValue> Ops, SelectionDAG &DAG) { + switch (Opcode) { + default: + return DAG.getNode(Opcode, dl, VT, Ops); + case X86ISD::CVTTP2SI: + case X86ISD::CVTTP2UI: + case X86ISD::CMPP: + case X86ISD::CMPM: + break; + } + + SmallVector<SDValue, 6> NewOps = {DAG.getEntryNode()}; + NewOps.append(Ops.begin(), Ops.end()); + return DAG.getNode(Opcode, dl, {VT, MVT::Other}, NewOps); +} + SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { // Helper to detect if the operand is CUR_DIRECTION rounding mode. @@ -23144,23 +23205,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, MVT VT = Op.getSimpleValueType(); const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); - // We share some nodes between STRICT and non STRICT FP intrinsics. - // For these nodes, we need chain them to entry token if they are not called - // by STRICT FP intrinsics. 
- auto getProperNode = [&](unsigned Opcode, EVT VT, ArrayRef<SDValue> Ops) { - switch (Opcode) { - default: - return DAG.getNode(Opcode, dl, VT, Ops); - case X86ISD::CMPP: - case X86ISD::CMPM: - break; - } - - SmallVector<SDValue, 6> NewOps = {DAG.getEntryNode()}; - NewOps.append(Ops.begin(), Ops.end()); - return DAG.getNode(Opcode, dl, {VT, MVT::Other}, NewOps); - }; - if (IntrData) { switch(IntrData->Type) { case INTR_TYPE_1OP: { @@ -23178,7 +23222,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (!isRoundModeCurDirection(Rnd)) return SDValue(); } - return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1)); + return getProperNode(IntrData->Opc0, dl, Op.getValueType(), + Op.getOperand(1), DAG); } case INTR_TYPE_1OP_SAE: { SDValue Sae = Op.getOperand(2); @@ -23249,8 +23294,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return SDValue(); } - return getProperNode(IntrData->Opc0, Op.getValueType(), - {Src1, Src2, Src3}); + return getProperNode(IntrData->Opc0, dl, Op.getValueType(), + {Src1, Src2, Src3}, DAG); } case INTR_TYPE_4OP: return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1), @@ -23274,8 +23319,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (!isRoundModeCurDirection(Rnd)) return SDValue(); } - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), - Mask, PassThru, Subtarget, DAG); + return getVectorMaskingNode( + getProperNode(IntrData->Opc0, dl, VT, Src, DAG), Mask, PassThru, + Subtarget, DAG); } case INTR_TYPE_1OP_MASK_SAE: { SDValue Src = Op.getOperand(1); @@ -23291,8 +23337,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, else return SDValue(); - return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), - Mask, PassThru, Subtarget, DAG); + return getVectorMaskingNode(getProperNode(Opc, dl, VT, Src, DAG), Mask, + PassThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); @@ -23498,8 +23544,8 @@ SDValue 
X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return SDValue(); } //default rounding mode - return getProperNode(IntrData->Opc0, MaskVT, - {Op.getOperand(1), Op.getOperand(2), CC}); + return getProperNode(IntrData->Opc0, dl, MaskVT, + {Op.getOperand(1), Op.getOperand(2), CC}, DAG); } case CMP_MASK_SCALAR_CC: { SDValue Src1 = Op.getOperand(1); @@ -23694,13 +23740,13 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SDValue Mask = Op.getOperand(3); if (isAllOnesConstant(Mask)) - return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src); + return getProperNode(IntrData->Opc0, dl, Op.getValueType(), Src, DAG); MVT SrcVT = Src.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); - return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru, - Mask); + return getProperNode(IntrData->Opc1, dl, Op.getValueType(), + {Src, PassThru, Mask}, DAG); } case CVTPS2PH_MASK: { SDValue Src = Op.getOperand(1); @@ -28566,8 +28612,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue Res; SDValue Chain; if (IsStrict) { - Res = DAG.getNode(ISD::FP_TO_SINT, dl, { PromoteVT, MVT::Other }, - { N->getOperand(0), Src }); + Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other}, + {N->getOperand(0), Src}); Chain = Res.getValue(1); } else Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src); @@ -28610,11 +28656,19 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, // legalization to v8i32<-v8f64. return; } - // FIXME: Strict fp. - assert(!IsStrict && "Missing STRICT_FP_TO_SINT support!"); unsigned Opc = IsSigned ? 
X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; - SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); + SDValue Res; + SDValue Chain; + if (IsStrict) { + Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other}, + {N->getOperand(0), Src}); + Chain = Res.getValue(1); + } else + Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other}, + {DAG.getEntryNode(), Src}); Results.push_back(Res); + if (IsStrict) + Results.push_back(Chain); return; } @@ -34719,7 +34773,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, break; case X86ISD::CVTP2SI: case X86ISD::CVTP2UI: case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI: - case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI: case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI: case X86ISD::CVTSI2P: case X86ISD::CVTUI2P: case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P: @@ -34728,6 +34781,12 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, In.getOperand(0).getValueType() == MVT::v2i64) return N->getOperand(0); // return the bitcast break; + case X86ISD::CVTTP2SI: + case X86ISD::CVTTP2UI: + if (In.getOperand(1).getValueType() == MVT::v2f64 || + In.getOperand(1).getValueType() == MVT::v2i64) + return N->getOperand(0); + break; } } @@ -42431,12 +42490,16 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); // Convert a full vector load into vzload when not all bits are needed. - SDValue In = N->getOperand(0); + SDValue In; + if (N->getOpcode() == X86ISD::CVTTP2SI || N->getOpcode() == X86ISD::CVTTP2UI) + In = N->getOperand(1); + else + In = N->getOperand(0); MVT InVT = In.getSimpleValueType(); if (VT.getVectorNumElements() < InVT.getVectorNumElements() && ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { assert(InVT.is128BitVector() && "Expected 128-bit input vector"); - LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); + LoadSDNode *LN = cast<LoadSDNode>(In); // Unless the load is volatile or atomic. 
if (LN->isSimple()) { SDLoc dl(N); @@ -42450,9 +42513,13 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, LN->getPointerInfo(), LN->getAlignment(), LN->getMemOperand()->getFlags()); - SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, - DAG.getBitcast(InVT, VZLoad)); - DCI.CombineTo(N, Convert); + SDValue Convert = getProperNode(N->getOpcode(), dl, VT, + DAG.getBitcast(InVT, VZLoad), DAG); + if (Convert->getOpcode() == X86ISD::CVTTP2SI || + Convert->getOpcode() == X86ISD::CVTTP2UI) + DCI.CombineTo(N, Convert.getValue(0), Convert.getValue(1)); + else + DCI.CombineTo(N, Convert); DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); return SDValue(N, 0); } diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 180f70e33f8eb..83a346543c46e 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -7350,29 +7350,29 @@ let Predicates = [HasAVX512], ExeDomain = _SrcRC.ExeDomain in { } defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, + any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, "{l}">, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, + any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I, "{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I, + any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I, "{l}">, XD, EVEX_CD8<64, CD8VT1>; defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info, - fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I, + any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I, "{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>; defm VCVTTSS2USIZ: 
avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, + any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, "{l}">, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, + any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I, "{q}">, XS,VEX_W, EVEX_CD8<32, CD8VT1>; defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I, + any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I, "{l}">, XD, EVEX_CD8<64, CD8VT1>; defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info, - fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I, + any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I, "{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index 047b03ae77c56..6bfbf5abb0ee3 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -623,8 +623,8 @@ def X86cvtp2UIntRnd : SDNode<"X86ISD::CVTP2UI_RND", SDTFloatToIntRnd>; // Vector without rounding mode // cvtt fp-to-int staff -def X86cvttp2si : SDNode<"X86ISD::CVTTP2SI", SDTFloatToInt>; -def X86cvttp2ui : SDNode<"X86ISD::CVTTP2UI", SDTFloatToInt>; +def X86cvttp2si : SDNode<"X86ISD::CVTTP2SI", SDTFloatToInt, [SDNPHasChain]>; +def X86cvttp2ui : SDNode<"X86ISD::CVTTP2UI", SDTFloatToInt, [SDNPHasChain]>; def X86VSintToFP : SDNode<"X86ISD::CVTSI2P", SDTVintToFP>; def X86VUintToFP : SDNode<"X86ISD::CVTUI2P", SDTVintToFP>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 83c6f27cbe8c3..a2a5f1f1d4357 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td 
+++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -868,19 +868,19 @@ let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in { } let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in { -defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, +defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG; -defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, +defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG; -defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, +defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG; -defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, +defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG; @@ -926,16 +926,16 @@ let Predicates = [UseAVX] in { } let isCodeGenOnly = 1 in { -defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, +defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC; -defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, +defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC; -defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, +defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC; -defm CVTTSD2SI64 : 
sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, +defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC; defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, @@ -1595,9 +1595,9 @@ def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}", (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">; let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))), + def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))), (VCVTTPD2DQYrr VR256:$src)>; - def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))), + def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))), (VCVTTPD2DQYrm addr:$src)>; } diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index e94f16a562994..551e393544907 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -4322,17 +4322,21 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) { ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512: ; X64: # %bb.0: +; X64-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttpd2dq %zmm0, %ymm1 {%k1} +; X64-NEXT: vcvttpd2dq %zmm0, %ymm2 +; X64-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} ; X64-NEXT: vcvttpd2dq {sae}, %zmm0, %ymm0 ; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_512: ; X86: # %bb.0: +; X86-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vcvttpd2dq %zmm0, %ymm1 {%k1} +; X86-NEXT: vcvttpd2dq %zmm0, %ymm2 +; X86-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} ; X86-NEXT: vcvttpd2dq {sae}, %zmm0, %ymm0 ; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; X86-NEXT: retl @@ -4373,17 
+4377,21 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>, define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) { ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512: ; X64: # %bb.0: +; X64-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttpd2udq %zmm0, %ymm1 {%k1} +; X64-NEXT: vcvttpd2udq %zmm0, %ymm2 +; X64-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} ; X64-NEXT: vcvttpd2udq {sae}, %zmm0, %ymm0 ; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_512: ; X86: # %bb.0: +; X86-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vcvttpd2udq %zmm0, %ymm1 {%k1} +; X86-NEXT: vcvttpd2udq %zmm0, %ymm2 +; X86-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} ; X86-NEXT: vcvttpd2udq {sae}, %zmm0, %ymm0 ; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; X86-NEXT: retl @@ -4399,7 +4407,8 @@ define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttps2dq %zmm0, %zmm1 {%k1} +; X64-NEXT: vcvttps2dq %zmm0, %zmm2 +; X64-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} ; X64-NEXT: vcvttps2dq {sae}, %zmm0, %zmm0 ; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; X64-NEXT: retq @@ -4407,7 +4416,8 @@ define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_512: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vcvttps2dq %zmm0, %zmm1 {%k1} +; X86-NEXT: vcvttps2dq %zmm0, %zmm2 +; X86-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} ; X86-NEXT: vcvttps2dq {sae}, %zmm0, %zmm0 ; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; X86-NEXT: retl @@ -4423,7 +4433,8 @@ define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 ; X64-LABEL: 
test_int_x86_avx512_mask_cvtt_ps2udq_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttps2udq %zmm0, %zmm1 {%k1} +; X64-NEXT: vcvttps2udq %zmm0, %zmm2 +; X64-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} ; X64-NEXT: vcvttps2udq {sae}, %zmm0, %zmm0 ; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; X64-NEXT: retq @@ -4431,7 +4442,8 @@ define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_512: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vcvttps2udq %zmm0, %zmm1 {%k1} +; X86-NEXT: vcvttps2udq %zmm0, %zmm2 +; X86-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1} ; X86-NEXT: vcvttps2udq {sae}, %zmm0, %zmm0 ; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll index 506db572671ab..fc04ae835fb06 100644 --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics.ll @@ -286,7 +286,8 @@ define <8 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_512(<8 x double> %x0, <8 x ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_512: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttpd2qq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x49,0x7a,0xc8] +; X86-NEXT: vcvttpd2qq %zmm0, %zmm2 # encoding: [0x62,0xf1,0xfd,0x48,0x7a,0xd0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x49,0x6f,0xca] ; X86-NEXT: vcvttpd2qq {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x18,0x7a,0xc0] ; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X86-NEXT: retl # encoding: [0xc3] @@ -294,7 +295,8 @@ define <8 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_512(<8 x double> %x0, <8 x ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttpd2qq %zmm0, %zmm1 {%k1} # encoding: 
[0x62,0xf1,0xfd,0x49,0x7a,0xc8] +; X64-NEXT: vcvttpd2qq %zmm0, %zmm2 # encoding: [0x62,0xf1,0xfd,0x48,0x7a,0xd0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x49,0x6f,0xca] ; X64-NEXT: vcvttpd2qq {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x18,0x7a,0xc0] ; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] @@ -310,7 +312,8 @@ define <8 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_512(<8 x double> %x0, <8 x ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_512: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttpd2uqq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x49,0x78,0xc8] +; X86-NEXT: vcvttpd2uqq %zmm0, %zmm2 # encoding: [0x62,0xf1,0xfd,0x48,0x78,0xd0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x49,0x6f,0xca] ; X86-NEXT: vcvttpd2uqq {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x18,0x78,0xc0] ; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X86-NEXT: retl # encoding: [0xc3] @@ -318,7 +321,8 @@ define <8 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_512(<8 x double> %x0, <8 x ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttpd2uqq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x49,0x78,0xc8] +; X64-NEXT: vcvttpd2uqq %zmm0, %zmm2 # encoding: [0x62,0xf1,0xfd,0x48,0x78,0xd0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x49,0x6f,0xca] ; X64-NEXT: vcvttpd2uqq {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x18,0x78,0xc0] ; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] @@ -334,7 +338,8 @@ define <8 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_512(<8 x float> %x0, <8 x i ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_512: ; X86: # %bb.0: ; 
X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttps2qq %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x7a,0xc8] +; X86-NEXT: vcvttps2qq %ymm0, %zmm2 # encoding: [0x62,0xf1,0x7d,0x48,0x7a,0xd0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x49,0x6f,0xca] ; X86-NEXT: vcvttps2qq {sae}, %ymm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x18,0x7a,0xc0] ; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X86-NEXT: retl # encoding: [0xc3] @@ -342,7 +347,8 @@ define <8 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_512(<8 x float> %x0, <8 x i ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttps2qq %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x7a,0xc8] +; X64-NEXT: vcvttps2qq %ymm0, %zmm2 # encoding: [0x62,0xf1,0x7d,0x48,0x7a,0xd0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x49,0x6f,0xca] ; X64-NEXT: vcvttps2qq {sae}, %ymm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x18,0x7a,0xc0] ; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] @@ -358,7 +364,8 @@ define <8 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_512(<8 x float> %x0, <8 x ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_512: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttps2uqq %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x78,0xc8] +; X86-NEXT: vcvttps2uqq %ymm0, %zmm2 # encoding: [0x62,0xf1,0x7d,0x48,0x78,0xd0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x49,0x6f,0xca] ; X86-NEXT: vcvttps2uqq {sae}, %ymm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x18,0x78,0xc0] ; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X86-NEXT: retl # encoding: [0xc3] @@ -366,7 +373,8 @@ define <8 x 
i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_512(<8 x float> %x0, <8 x ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttps2uqq %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x78,0xc8] +; X64-NEXT: vcvttps2uqq %ymm0, %zmm2 # encoding: [0x62,0xf1,0x7d,0x48,0x78,0xd0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x49,0x6f,0xca] ; X64-NEXT: vcvttps2uqq {sae}, %ymm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x18,0x78,0xc0] ; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll index 431d6f9d28f00..5afbf5b672059 100644 --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ -599,17 +599,17 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double>, <2 x i64>, i define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_128: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7a,0xc8] ; X86-NEXT: vcvttpd2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x7a,0xc0] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc8] ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7a,0xc8] ; X64-NEXT: vcvttpd2qq 
%xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x7a,0xc0] +; X64-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc8] ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) @@ -623,17 +623,17 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double>, <4 x i64>, i define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_256: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttpd2qq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7a,0xc8] ; X86-NEXT: vcvttpd2qq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x7a,0xc0] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xc8] ; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttpd2qq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7a,0xc8] ; X64-NEXT: vcvttpd2qq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x7a,0xc0] +; X64-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xc8] ; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) @@ -647,17 +647,17 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double>, <2 x i64>, define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_128(<2 x double> 
%x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_128: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x78,0xc8] ; X86-NEXT: vcvttpd2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x78,0xc0] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc8] ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x78,0xc8] ; X64-NEXT: vcvttpd2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x78,0xc0] +; X64-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc8] ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) @@ -671,17 +671,17 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double>, <4 x i64>, define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_256: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x78,0xc8] ; X86-NEXT: vcvttpd2uqq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x78,0xc0] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xc8] ; X86-NEXT: vpaddq 
%ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x78,0xc8] ; X64-NEXT: vcvttpd2uqq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x78,0xc0] +; X64-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xc8] ; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) @@ -695,17 +695,17 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float>, <2 x i64>, i8 define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0xc8] ; X86-NEXT: vcvttps2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0xc0] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc8] ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0xc8] ; X64-NEXT: vcvttps2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0xc0] +; X64-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc8] ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX 
TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) @@ -735,14 +735,16 @@ define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2qq_128_load(<2 x float>* %p, ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128_load: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvttps2qq (%eax), %xmm1 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vcvttps2qq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0x00] +; X86-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128_load: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vcvttps2qq (%rdi), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0x07] +; X64-NEXT: vcvttps2qq (%rdi), %xmm1 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0x0f] +; X64-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %x0 = load <2 x float>, <2 x float>* %p %x0b = shufflevector <2 x float> %x0, <2 x float> zeroinitializer, <4 x i32> @@ -754,14 +756,16 @@ define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2qq_128_load(<2 x float>* %p, ; X86-LABEL: test_int_x86_avx512_maskz_cvtt_ps2qq_128_load: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvttps2qq (%eax), %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vcvttps2qq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x7a,0x00] +; X86-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; 
; X64-LABEL: test_int_x86_avx512_maskz_cvtt_ps2qq_128_load: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vcvttps2qq (%rdi), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x7a,0x07] +; X64-NEXT: vcvttps2qq (%rdi), %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0x07] +; X64-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %x0 = load <2 x float>, <2 x float>* %p %x0b = shufflevector <2 x float> %x0, <2 x float> zeroinitializer, <4 x i32> @@ -791,14 +795,16 @@ define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2qq_128_load_2(<2 x float>* %p ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128_load_2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvttps2qq (%eax), %xmm1 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vcvttps2qq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0x00] +; X86-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128_load_2: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vcvttps2qq (%rdi), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0x07] +; X64-NEXT: vcvttps2qq (%rdi), %xmm1 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0x0f] +; X64-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %x0 = load <2 x float>, <2 x float>* %p %x0b = shufflevector <2 x float> %x0, <2 x float> undef, <4 x i32> @@ -810,14 +816,16 @@ define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2qq_128_load_2(<2 x float>* % ; X86-LABEL: test_int_x86_avx512_maskz_cvtt_ps2qq_128_load_2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: 
vcvttps2qq (%eax), %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vcvttps2qq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x7a,0x00] +; X86-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_cvtt_ps2qq_128_load_2: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vcvttps2qq (%rdi), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x7a,0x07] +; X64-NEXT: vcvttps2qq (%rdi), %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0x07] +; X64-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %x0 = load <2 x float>, <2 x float>* %p %x0b = shufflevector <2 x float> %x0, <2 x float> undef, <4 x i32> @@ -845,14 +853,16 @@ define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2qq_128_load_3(<4 x float>* %p ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128_load_3: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvttps2qq (%eax), %xmm1 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vcvttps2qq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0x00] +; X86-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128_load_3: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vcvttps2qq (%rdi), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0x07] +; X64-NEXT: vcvttps2qq (%rdi), %xmm1 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0x0f] +; X64-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %x0 = load <4 x 
float>, <4 x float>* %p %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %passthru, i8 %mask) @@ -863,14 +873,16 @@ define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2qq_128_load_3(<4 x float>* % ; X86-LABEL: test_int_x86_avx512_maskz_cvtt_ps2qq_128_load_3: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvttps2qq (%eax), %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vcvttps2qq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x7a,0x00] +; X86-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_cvtt_ps2qq_128_load_3: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vcvttps2qq (%rdi), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x7a,0x07] +; X64-NEXT: vcvttps2qq (%rdi), %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0x07] +; X64-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %x0 = load <4 x float>, <4 x float>* %p %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> zeroinitializer, i8 %mask) @@ -882,17 +894,17 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float>, <4 x i64>, i8 define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_256: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7a,0xc8] ; X86-NEXT: vcvttps2qq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x7a,0xc0] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmovdqa64 
%ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xc8] ; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7a,0xc8] ; X64-NEXT: vcvttps2qq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x7a,0xc0] +; X64-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xc8] ; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) @@ -983,17 +995,17 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float>, <2 x i64>, i define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0xc8] ; X86-NEXT: vcvttps2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0xc0] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc8] ; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0xc8] ; X64-NEXT: vcvttps2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0xc0] +; X64-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} # 
encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc8] ; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) @@ -1023,14 +1035,16 @@ define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2uqq_128_load(<2 x float>* %p, ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128_load: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvttps2uqq (%eax), %xmm1 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vcvttps2uqq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0x00] +; X86-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128_load: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vcvttps2uqq (%rdi), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0x07] +; X64-NEXT: vcvttps2uqq (%rdi), %xmm1 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0x0f] +; X64-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %x0 = load <2 x float>, <2 x float>* %p %x0b = shufflevector <2 x float> %x0, <2 x float> zeroinitializer, <4 x i32> @@ -1042,14 +1056,16 @@ define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load(<2 x float>* %p ; X86-LABEL: test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvttps2uqq (%eax), %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vcvttps2uqq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x78,0x00] +; X86-NEXT: vmovdqa64 
%xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vcvttps2uqq (%rdi), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x78,0x07] +; X64-NEXT: vcvttps2uqq (%rdi), %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0x07] +; X64-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %x0 = load <2 x float>, <2 x float>* %p %x0b = shufflevector <2 x float> %x0, <2 x float> zeroinitializer, <4 x i32> @@ -1079,14 +1095,16 @@ define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2uqq_128_load_2(<2 x float>* % ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128_load_2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvttps2uqq (%eax), %xmm1 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vcvttps2uqq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0x00] +; X86-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128_load_2: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vcvttps2uqq (%rdi), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0x07] +; X64-NEXT: vcvttps2uqq (%rdi), %xmm1 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0x0f] +; X64-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %x0 = load <2 x float>, <2 x float>* %p %x0b = shufflevector <2 x float> %x0, <2 x float> undef, <4 x i32> @@ -1098,14 +1116,16 @@ define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load_2(<2 x float>* ; X86-LABEL: 
test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load_2: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvttps2uqq (%eax), %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vcvttps2uqq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x78,0x00] +; X86-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load_2: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vcvttps2uqq (%rdi), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x78,0x07] +; X64-NEXT: vcvttps2uqq (%rdi), %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0x07] +; X64-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %x0 = load <2 x float>, <2 x float>* %p %x0b = shufflevector <2 x float> %x0, <2 x float> undef, <4 x i32> @@ -1133,14 +1153,16 @@ define <2 x i64> @test_int_x86_avx512_mask_cvtt_ps2uqq_128_load_3(<4 x float>* % ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128_load_3: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvttps2uqq (%eax), %xmm1 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vcvttps2uqq (%eax), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0x00] +; X86-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128_load_3: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vcvttps2uqq (%rdi), %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0x07] +; X64-NEXT: vcvttps2uqq (%rdi), %xmm1 # encoding: 
[0x62,0xf1,0x7d,0x08,0x78,0x0f] +; X64-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %x0 = load <4 x float>, <4 x float>* %p %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %passthru, i8 %mask) @@ -1151,14 +1173,16 @@ define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load_3(<4 x float>* ; X86-LABEL: test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load_3: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vcvttps2uqq (%eax), %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0x00] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vcvttps2uqq (%eax), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x78,0x00] +; X86-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load_3: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vcvttps2uqq (%rdi), %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x78,0x07] +; X64-NEXT: vcvttps2uqq (%rdi), %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0x07] +; X64-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %x0 = load <4 x float>, <4 x float>* %p %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> zeroinitializer, i8 %mask) @@ -1170,17 +1194,17 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float>, <4 x i64>, i define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_256: ; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x78,0xc8] ; 
X86-NEXT: vcvttps2uqq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x78,0xc0] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xc8] ; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x78,0xc8] ; X64-NEXT: vcvttps2uqq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x78,0xc0] +; X64-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xc8] ; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll index 82a19ba41cdd5..2337745bdd3ef 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll @@ -682,15 +682,17 @@ define <2 x i64> @test_mm256_mask_cvttpd_epi32(<2 x i64> %__W, i8 zeroext %__U, ; X86-LABEL: test_mm256_mask_cvttpd_epi32: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: vcvttpd2dq %ymm1, %xmm1 ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vcvttpd2dq %ymm1, %xmm0 {%k1} +; X86-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_cvttpd_epi32: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvttpd2dq %ymm1, %xmm1 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttpd2dq %ymm1, %xmm0 {%k1} +; X64-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; X64-NEXT: vzeroupper ; X64-NEXT: retq entry: @@ -707,15 +709,17 @@ define <2 x i64> 
@test_mm256_maskz_cvttpd_epi32(i8 zeroext %__U, <4 x double> %_ ; X86-LABEL: test_mm256_maskz_cvttpd_epi32: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: vcvttpd2dq %ymm0, %xmm0 ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vcvttpd2dq %ymm0, %xmm0 {%k1} {z} +; X86-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_cvttpd_epi32: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvttpd2dq %ymm0, %xmm0 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttpd2dq %ymm0, %xmm0 {%k1} {z} +; X64-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; X64-NEXT: vzeroupper ; X64-NEXT: retq entry: @@ -794,14 +798,16 @@ define <2 x i64> @test_mm256_mask_cvttpd_epu32(<2 x i64> %__W, i8 zeroext %__U, ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vcvttpd2udq %ymm1, %xmm0 {%k1} +; X86-NEXT: vcvttpd2udq %ymm1, %xmm1 +; X86-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_cvttpd_epu32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttpd2udq %ymm1, %xmm0 {%k1} +; X64-NEXT: vcvttpd2udq %ymm1, %xmm1 +; X64-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; X64-NEXT: vzeroupper ; X64-NEXT: retq entry: @@ -816,14 +822,16 @@ define <2 x i64> @test_mm256_maskz_cvttpd_epu32(i8 zeroext %__U, <4 x double> %_ ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vcvttpd2udq %ymm0, %xmm0 {%k1} {z} +; X86-NEXT: vcvttpd2udq %ymm0, %xmm0 +; X86-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_cvttpd_epu32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttpd2udq %ymm0, %xmm0 {%k1} {z} +; X64-NEXT: vcvttpd2udq %ymm0, %xmm0 +; X64-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; X64-NEXT: vzeroupper ; X64-NEXT: retq entry: @@ -836,14 +844,16 @@ define <2 x i64> @test_mm_mask_cvttps_epi32(<2 x i64> %__W, 
i8 zeroext %__U, <4 ; X86-LABEL: test_mm_mask_cvttps_epi32: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: vcvttps2dq %xmm1, %xmm1 ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vcvttps2dq %xmm1, %xmm0 {%k1} +; X86-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_cvttps_epi32: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvttps2dq %xmm1, %xmm1 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttps2dq %xmm1, %xmm0 {%k1} +; X64-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; X64-NEXT: retq entry: %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8 @@ -859,14 +869,16 @@ define <2 x i64> @test_mm_maskz_cvttps_epi32(i8 zeroext %__U, <4 x float> %__A) ; X86-LABEL: test_mm_maskz_cvttps_epi32: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: vcvttps2dq %xmm0, %xmm0 ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vcvttps2dq %xmm0, %xmm0 {%k1} {z} +; X86-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; X86-NEXT: retl ; ; X64-LABEL: test_mm_maskz_cvttps_epi32: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvttps2dq %xmm0, %xmm0 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttps2dq %xmm0, %xmm0 {%k1} {z} +; X64-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; X64-NEXT: retq entry: %0 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %__A) #8 @@ -881,14 +893,16 @@ define <4 x i64> @test_mm256_mask_cvttps_epi32(<4 x i64> %__W, i8 zeroext %__U, ; X86-LABEL: test_mm256_mask_cvttps_epi32: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: vcvttps2dq %ymm1, %ymm1 ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vcvttps2dq %ymm1, %ymm0 {%k1} +; X86-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_cvttps_epi32: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvttps2dq %ymm1, %ymm1 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttps2dq %ymm1, %ymm0 {%k1} +; X64-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} ; X64-NEXT: retq entry: %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) 
#8 @@ -903,14 +917,16 @@ define <4 x i64> @test_mm256_maskz_cvttps_epi32(i8 zeroext %__U, <8 x float> %__ ; X86-LABEL: test_mm256_maskz_cvttps_epi32: ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: vcvttps2dq %ymm0, %ymm0 ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vcvttps2dq %ymm0, %ymm0 {%k1} {z} +; X86-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_cvttps_epi32: ; X64: # %bb.0: # %entry +; X64-NEXT: vcvttps2dq %ymm0, %ymm0 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttps2dq %ymm0, %ymm0 {%k1} {z} +; X64-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; X64-NEXT: retq entry: %0 = tail call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %__A) #8 @@ -936,13 +952,15 @@ define <2 x i64> @test_mm_mask_cvttps_epu32(<2 x i64> %__W, i8 zeroext %__U, <4 ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vcvttps2udq %xmm1, %xmm0 {%k1} +; X86-NEXT: vcvttps2udq %xmm1, %xmm1 +; X86-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; X86-NEXT: retl ; ; X64-LABEL: test_mm_mask_cvttps_epu32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttps2udq %xmm1, %xmm0 {%k1} +; X64-NEXT: vcvttps2udq %xmm1, %xmm1 +; X64-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} ; X64-NEXT: retq entry: %0 = bitcast <2 x i64> %__W to <4 x i32> @@ -956,13 +974,15 @@ define <2 x i64> @test_mm_maskz_cvttps_epu32(i8 zeroext %__U, <4 x float> %__A) ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vcvttps2udq %xmm0, %xmm0 {%k1} {z} +; X86-NEXT: vcvttps2udq %xmm0, %xmm0 +; X86-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; X86-NEXT: retl ; ; X64-LABEL: test_mm_maskz_cvttps_epu32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttps2udq %xmm0, %xmm0 {%k1} {z} +; X64-NEXT: vcvttps2udq %xmm0, %xmm0 +; X64-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; X64-NEXT: retq entry: %0 = tail call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x 
float> %__A, <4 x i32> zeroinitializer, i8 %__U) #8 @@ -986,13 +1006,15 @@ define <4 x i64> @test_mm256_mask_cvttps_epu32(<4 x i64> %__W, i8 zeroext %__U, ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vcvttps2udq %ymm1, %ymm0 {%k1} +; X86-NEXT: vcvttps2udq %ymm1, %ymm1 +; X86-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_mask_cvttps_epu32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttps2udq %ymm1, %ymm0 {%k1} +; X64-NEXT: vcvttps2udq %ymm1, %ymm1 +; X64-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} ; X64-NEXT: retq entry: %0 = bitcast <4 x i64> %__W to <8 x i32> @@ -1006,13 +1028,15 @@ define <4 x i64> @test_mm256_maskz_cvttps_epu32(i8 zeroext %__U, <8 x float> %__ ; X86: # %bb.0: # %entry ; X86-NEXT: movb {{[0-9]+}}(%esp), %al ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vcvttps2udq %ymm0, %ymm0 {%k1} {z} +; X86-NEXT: vcvttps2udq %ymm0, %ymm0 +; X86-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_maskz_cvttps_epu32: ; X64: # %bb.0: # %entry ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vcvttps2udq %ymm0, %ymm0 {%k1} {z} +; X64-NEXT: vcvttps2udq %ymm0, %ymm0 +; X64-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; X64-NEXT: retq entry: %0 = tail call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %__A, <8 x i32> zeroinitializer, i8 %__U) #8 diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 233b9162c9262..a5658036565be 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -7859,7 +7859,6 @@ define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i3 ; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] ; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] ; X64-NEXT: retq # 
encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4) %res1 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1) %res2 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> zeroinitializer, i8 %x4) @@ -10375,20 +10374,20 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double>, <4 x i32>, i define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_256: ; X86: # %bb.0: -; X86-NEXT: vcvttpd2dq %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xd0] +; X86-NEXT: vcvttpd2dq %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xe6,0xc8] -; X86-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X86-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8] +; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_256: ; X64: # %bb.0: -; X64-NEXT: vcvttpd2dq %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xd0] +; X64-NEXT: vcvttpd2dq %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xe6,0xc8] -; X64-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X64-NEXT: vmovdqa32 %xmm0, 
%xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8] +; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) @@ -10402,19 +10401,19 @@ declare <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float>, <4 x i32>, i8 define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128: ; X86: # %bb.0: -; X86-NEXT: vcvttps2dq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xd0] +; X86-NEXT: vcvttps2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xc0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x5b,0xc8] -; X86-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X86-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8] +; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128: ; X64: # %bb.0: -; X64-NEXT: vcvttps2dq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xd0] +; X64-NEXT: vcvttps2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xc0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x5b,0xc8] -; X64-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X64-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8] +; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO 
VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) @@ -10427,19 +10426,19 @@ declare <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float>, <8 x i32>, i8 define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256: ; X86: # %bb.0: -; X86-NEXT: vcvttps2dq %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xd0] +; X86-NEXT: vcvttps2dq %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xc0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x5b,0xc8] -; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc2] +; X86-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xc8] +; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256: ; X64: # %bb.0: -; X64-NEXT: vcvttps2dq %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xd0] +; X64-NEXT: vcvttps2dq %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xc0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x5b,0xc8] -; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc2] +; X64-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xc8] +; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; X64-NEXT: 
retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index b5dfa2f11ade8..9dfde40e8e59d 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -3704,8 +3704,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x78,0xc8] ; X86-NEXT: vcvttpd2udq %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x78,0xc0] +; X86-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8] ; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] @@ -3713,8 +3713,8 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x78,0xc8] ; X64-NEXT: vcvttpd2udq %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x78,0xc0] +; X64-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8] ; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] @@ -3731,16 +3731,16 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_128(<4 x float> %x0, <4 x ; X86: # %bb.0: ; X86-NEXT: movzbl 
{{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x78,0xc8] ; X86-NEXT: vcvttps2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x08,0x78,0xc0] +; X86-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8] ; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x78,0xc8] ; X64-NEXT: vcvttps2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x08,0x78,0xc0] +; X64-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8] ; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) @@ -3756,16 +3756,16 @@ define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_256(<8 x float> %x0, <8 x ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vcvttps2udq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x78,0xc8] ; X86-NEXT: vcvttps2udq %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x28,0x78,0xc0] +; X86-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xc8] ; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vcvttps2udq %ymm0, %ymm1 {%k1} # encoding: 
[0x62,0xf1,0x7c,0x29,0x78,0xc8] ; X64-NEXT: vcvttps2udq %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x28,0x78,0xc0] +; X64-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xc8] ; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll new file mode 100644 index 0000000000000..680acf98d2a1f --- /dev/null +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll @@ -0,0 +1,1312 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=SSE-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=SSE-X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=AVX-X86,AVX1-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=AVX-X64,AVX1-X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=AVX-X86,AVX512-X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=AVX-X64,AVX512-X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,X87 + +declare i1 @llvm.experimental.constrained.fptosi.i1.f32(float, metadata) +declare i8 @llvm.experimental.constrained.fptosi.i8.f32(float, metadata) +declare i16 
@llvm.experimental.constrained.fptosi.i16.f32(float, metadata) +declare i32 @llvm.experimental.constrained.fptosi.i32.f32(float, metadata) +declare i64 @llvm.experimental.constrained.fptosi.i64.f32(float, metadata) +declare i1 @llvm.experimental.constrained.fptoui.i1.f32(float, metadata) +declare i8 @llvm.experimental.constrained.fptoui.i8.f32(float, metadata) +declare i16 @llvm.experimental.constrained.fptoui.i16.f32(float, metadata) +declare i32 @llvm.experimental.constrained.fptoui.i32.f32(float, metadata) +declare i64 @llvm.experimental.constrained.fptoui.i64.f32(float, metadata) + +declare i1 @llvm.experimental.constrained.fptosi.i1.f64(double, metadata) +declare i8 @llvm.experimental.constrained.fptosi.i8.f64(double, metadata) +declare i16 @llvm.experimental.constrained.fptosi.i16.f64(double, metadata) +declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata) +declare i64 @llvm.experimental.constrained.fptosi.i64.f64(double, metadata) +declare i1 @llvm.experimental.constrained.fptoui.i1.f64(double, metadata) +declare i8 @llvm.experimental.constrained.fptoui.i8.f64(double, metadata) +declare i16 @llvm.experimental.constrained.fptoui.i16.f64(double, metadata) +declare i32 @llvm.experimental.constrained.fptoui.i32.f64(double, metadata) +declare i64 @llvm.experimental.constrained.fptoui.i64.f64(double, metadata) + +define i1 @fptosi_f32toi1(float %x) #0 { +; SSE-X86-LABEL: fptosi_f32toi1: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: cvttss2si {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: # kill: def $al killed $al killed $eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptosi_f32toi1: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttss2si %xmm0, %eax +; SSE-X64-NEXT: # kill: def $al killed $al killed $eax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptosi_f32toi1: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: vcvttss2si {{[0-9]+}}(%esp), %eax +; AVX-X86-NEXT: # kill: def $al killed $al killed $eax +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptosi_f32toi1: +; AVX-X64: # 
%bb.0: +; AVX-X64-NEXT: vcvttss2si %xmm0, %eax +; AVX-X64-NEXT: # kill: def $al killed $al killed $eax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptosi_f32toi1: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistps {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %result = call i1 @llvm.experimental.constrained.fptosi.i1.f32(float %x, + metadata !"fpexcept.strict") #0 + ret i1 %result +} + +define i8 @fptosi_f32toi8(float %x) #0 { +; SSE-X86-LABEL: fptosi_f32toi8: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: cvttss2si {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: # kill: def $al killed $al killed $eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptosi_f32toi8: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttss2si %xmm0, %eax +; SSE-X64-NEXT: # kill: def $al killed $al killed $eax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptosi_f32toi8: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: vcvttss2si {{[0-9]+}}(%esp), %eax +; AVX-X86-NEXT: # kill: def $al killed $al killed $eax +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptosi_f32toi8: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vcvttss2si %xmm0, %eax +; AVX-X64-NEXT: # kill: def $al killed $al killed $eax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptosi_f32toi8: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistps {{[0-9]+}}(%esp) +; 
CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %result = call i8 @llvm.experimental.constrained.fptosi.i8.f32(float %x, + metadata !"fpexcept.strict") #0 + ret i8 %result +} + +define i16 @fptosi_f32toi16(float %x) #0 { +; SSE-X86-LABEL: fptosi_f32toi16: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: cvttss2si {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptosi_f32toi16: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttss2si %xmm0, %eax +; SSE-X64-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptosi_f32toi16: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: vcvttss2si {{[0-9]+}}(%esp), %eax +; AVX-X86-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptosi_f32toi16: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vcvttss2si %xmm0, %eax +; AVX-X64-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptosi_f32toi16: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistps {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %result = call i16 @llvm.experimental.constrained.fptosi.i16.f32(float %x, + metadata !"fpexcept.strict") #0 + ret i16 %result +} + +define i32 @fptosi_f32toi32(float %x) #0 { +; SSE-X86-LABEL: fptosi_f32toi32: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: cvttss2si {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptosi_f32toi32: +; SSE-X64: # 
%bb.0: +; SSE-X64-NEXT: cvttss2si %xmm0, %eax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptosi_f32toi32: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: vcvttss2si {{[0-9]+}}(%esp), %eax +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptosi_f32toi32: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vcvttss2si %xmm0, %eax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptosi_f32toi32: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fnstcw (%esp) +; CHECK-NEXT: movzwl (%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw (%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %result = call i32 @llvm.experimental.constrained.fptosi.i32.f32(float %x, + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +define i64 @fptosi_f32toi64(float %x) #0 { +; SSE-X86-LABEL: fptosi_f32toi64: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: pushl %ebp +; SSE-X86-NEXT: .cfi_def_cfa_offset 8 +; SSE-X86-NEXT: .cfi_offset %ebp, -8 +; SSE-X86-NEXT: movl %esp, %ebp +; SSE-X86-NEXT: .cfi_def_cfa_register %ebp +; SSE-X86-NEXT: andl $-8, %esp +; SSE-X86-NEXT: subl $16, %esp +; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: flds {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-X86-NEXT: movl %ebp, %esp +; SSE-X86-NEXT: popl %ebp +; SSE-X86-NEXT: .cfi_def_cfa %esp, 4 +; SSE-X86-NEXT: 
retl +; +; SSE-X64-LABEL: fptosi_f32toi64: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttss2si %xmm0, %rax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptosi_f32toi64: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: pushl %ebp +; AVX-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX-X86-NEXT: .cfi_offset %ebp, -8 +; AVX-X86-NEXT: movl %esp, %ebp +; AVX-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX-X86-NEXT: andl $-8, %esp +; AVX-X86-NEXT: subl $8, %esp +; AVX-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-X86-NEXT: vmovss %xmm0, (%esp) +; AVX-X86-NEXT: flds (%esp) +; AVX-X86-NEXT: fisttpll (%esp) +; AVX-X86-NEXT: movl (%esp), %eax +; AVX-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-X86-NEXT: movl %ebp, %esp +; AVX-X86-NEXT: popl %ebp +; AVX-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptosi_f32toi64: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vcvttss2si %xmm0, %rax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptosi_f32toi64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset %ebp, -8 +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: .cfi_def_cfa_register %ebp +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $16, %esp +; CHECK-NEXT: flds 8(%ebp) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistpll {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: .cfi_def_cfa %esp, 4 +; CHECK-NEXT: retl + %result = call i64 @llvm.experimental.constrained.fptosi.i64.f32(float %x, + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +define i1 @fptoui_f32toi1(float %x) #0 { +; SSE-X86-LABEL: fptoui_f32toi1: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: cvttss2si {{[0-9]+}}(%esp), %eax +; 
SSE-X86-NEXT: # kill: def $al killed $al killed $eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptoui_f32toi1: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttss2si %xmm0, %eax +; SSE-X64-NEXT: # kill: def $al killed $al killed $eax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptoui_f32toi1: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: vcvttss2si {{[0-9]+}}(%esp), %eax +; AVX-X86-NEXT: # kill: def $al killed $al killed $eax +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptoui_f32toi1: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vcvttss2si %xmm0, %eax +; AVX-X64-NEXT: # kill: def $al killed $al killed $eax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptoui_f32toi1: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistps {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %result = call i1 @llvm.experimental.constrained.fptoui.i1.f32(float %x, + metadata !"fpexcept.strict") #0 + ret i1 %result +} + +define i8 @fptoui_f32toi8(float %x) #0 { +; SSE-X86-LABEL: fptoui_f32toi8: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: cvttss2si {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: # kill: def $al killed $al killed $eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptoui_f32toi8: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttss2si %xmm0, %eax +; SSE-X64-NEXT: # kill: def $al killed $al killed $eax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptoui_f32toi8: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: vcvttss2si {{[0-9]+}}(%esp), %eax +; AVX-X86-NEXT: # kill: def $al killed $al killed $eax +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptoui_f32toi8: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vcvttss2si %xmm0, 
%eax +; AVX-X64-NEXT: # kill: def $al killed $al killed $eax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptoui_f32toi8: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistps {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %result = call i8 @llvm.experimental.constrained.fptoui.i8.f32(float %x, + metadata !"fpexcept.strict") #0 + ret i8 %result +} + +define i16 @fptoui_f32toi16(float %x) #0 { +; SSE-X86-LABEL: fptoui_f32toi16: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: cvttss2si {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptoui_f32toi16: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttss2si %xmm0, %eax +; SSE-X64-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptoui_f32toi16: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: vcvttss2si {{[0-9]+}}(%esp), %eax +; AVX-X86-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptoui_f32toi16: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vcvttss2si %xmm0, %eax +; AVX-X64-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptoui_f32toi16: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fnstcw (%esp) +; CHECK-NEXT: movzwl (%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw (%esp) +; CHECK-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %result = call i16 @llvm.experimental.constrained.fptoui.i16.f32(float %x, + metadata !"fpexcept.strict") #0 + ret i16 %result +} + +define i32 @fptoui_f32toi32(float %x) #0 { +; SSE-X86-LABEL: fptoui_f32toi32: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-X86-NEXT: xorl %ecx, %ecx +; SSE-X86-NEXT: ucomiss %xmm0, %xmm1 +; SSE-X86-NEXT: setbe %cl +; SSE-X86-NEXT: shll $31, %ecx +; SSE-X86-NEXT: movaps %xmm0, %xmm2 +; SSE-X86-NEXT: cmpltss %xmm1, %xmm2 +; SSE-X86-NEXT: andnps %xmm1, %xmm2 +; SSE-X86-NEXT: subss %xmm2, %xmm0 +; SSE-X86-NEXT: cvttss2si %xmm0, %eax +; SSE-X86-NEXT: xorl %ecx, %eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptoui_f32toi32: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttss2si %xmm0, %rax +; SSE-X64-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-X64-NEXT: retq +; +; AVX1-X86-LABEL: fptoui_f32toi32: +; AVX1-X86: # %bb.0: +; AVX1-X86-NEXT: pushl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX1-X86-NEXT: .cfi_offset %ebp, -8 +; AVX1-X86-NEXT: movl %esp, %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX1-X86-NEXT: andl $-8, %esp +; AVX1-X86-NEXT: subl $8, %esp +; AVX1-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-X86-NEXT: vmovss %xmm0, (%esp) +; AVX1-X86-NEXT: flds (%esp) +; AVX1-X86-NEXT: fisttpll (%esp) +; AVX1-X86-NEXT: movl (%esp), %eax +; AVX1-X86-NEXT: movl %ebp, %esp +; AVX1-X86-NEXT: popl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-X86-NEXT: retl +; +; AVX1-X64-LABEL: fptoui_f32toi32: +; AVX1-X64: # %bb.0: +; AVX1-X64-NEXT: vcvttss2si %xmm0, %rax +; AVX1-X64-NEXT: # kill: def $eax killed $eax killed $rax +; AVX1-X64-NEXT: retq +; +; AVX512-X86-LABEL: fptoui_f32toi32: +; AVX512-X86: # %bb.0: +; AVX512-X86-NEXT: vcvttss2usi 
{{[0-9]+}}(%esp), %eax +; AVX512-X86-NEXT: retl +; +; AVX512-X64-LABEL: fptoui_f32toi32: +; AVX512-X64: # %bb.0: +; AVX512-X64-NEXT: vcvttss2usi %xmm0, %eax +; AVX512-X64-NEXT: retq +; +; CHECK-LABEL: fptoui_f32toi32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset %ebp, -8 +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: .cfi_def_cfa_register %ebp +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $16, %esp +; CHECK-NEXT: flds 8(%ebp) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistpll {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: .cfi_def_cfa %esp, 4 +; CHECK-NEXT: retl + %result = call i32 @llvm.experimental.constrained.fptoui.i32.f32(float %x, + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +define i64 @fptoui_f32toi64(float %x) #0 { +; SSE-X86-LABEL: fptoui_f32toi64: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: pushl %ebp +; SSE-X86-NEXT: .cfi_def_cfa_offset 8 +; SSE-X86-NEXT: .cfi_offset %ebp, -8 +; SSE-X86-NEXT: movl %esp, %ebp +; SSE-X86-NEXT: .cfi_def_cfa_register %ebp +; SSE-X86-NEXT: andl $-8, %esp +; SSE-X86-NEXT: subl $16, %esp +; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-X86-NEXT: movaps %xmm0, %xmm2 +; SSE-X86-NEXT: cmpltss %xmm1, %xmm2 +; SSE-X86-NEXT: andnps %xmm1, %xmm2 +; SSE-X86-NEXT: movaps %xmm0, %xmm3 +; SSE-X86-NEXT: subss %xmm2, %xmm3 +; SSE-X86-NEXT: movss %xmm3, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: flds {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; 
SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-X86-NEXT: xorl %edx, %edx +; SSE-X86-NEXT: ucomiss %xmm0, %xmm1 +; SSE-X86-NEXT: setbe %dl +; SSE-X86-NEXT: shll $31, %edx +; SSE-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: movl %ebp, %esp +; SSE-X86-NEXT: popl %ebp +; SSE-X86-NEXT: .cfi_def_cfa %esp, 4 +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptoui_f32toi64: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-X64-NEXT: xorl %ecx, %ecx +; SSE-X64-NEXT: ucomiss %xmm1, %xmm0 +; SSE-X64-NEXT: setae %cl +; SSE-X64-NEXT: shlq $63, %rcx +; SSE-X64-NEXT: movaps %xmm0, %xmm2 +; SSE-X64-NEXT: cmpltss %xmm1, %xmm2 +; SSE-X64-NEXT: andnps %xmm1, %xmm2 +; SSE-X64-NEXT: subss %xmm2, %xmm0 +; SSE-X64-NEXT: cvttss2si %xmm0, %rax +; SSE-X64-NEXT: xorq %rcx, %rax +; SSE-X64-NEXT: retq +; +; AVX1-X86-LABEL: fptoui_f32toi64: +; AVX1-X86: # %bb.0: +; AVX1-X86-NEXT: pushl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX1-X86-NEXT: .cfi_offset %ebp, -8 +; AVX1-X86-NEXT: movl %esp, %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX1-X86-NEXT: andl $-8, %esp +; AVX1-X86-NEXT: subl $8, %esp +; AVX1-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-X86-NEXT: vcmpltss %xmm1, %xmm0, %xmm2 +; AVX1-X86-NEXT: vandnps %xmm1, %xmm2, %xmm2 +; AVX1-X86-NEXT: vsubss %xmm2, %xmm0, %xmm2 +; AVX1-X86-NEXT: vmovss %xmm2, (%esp) +; AVX1-X86-NEXT: flds (%esp) +; AVX1-X86-NEXT: fisttpll (%esp) +; AVX1-X86-NEXT: xorl %edx, %edx +; AVX1-X86-NEXT: vucomiss %xmm0, %xmm1 +; AVX1-X86-NEXT: setbe %dl +; AVX1-X86-NEXT: shll $31, %edx +; AVX1-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX1-X86-NEXT: movl (%esp), %eax +; AVX1-X86-NEXT: movl %ebp, %esp +; AVX1-X86-NEXT: popl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-X86-NEXT: retl +; +; AVX1-X64-LABEL: 
fptoui_f32toi64: +; AVX1-X64: # %bb.0: +; AVX1-X64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX1-X64-NEXT: xorl %ecx, %ecx +; AVX1-X64-NEXT: vucomiss %xmm1, %xmm0 +; AVX1-X64-NEXT: setae %cl +; AVX1-X64-NEXT: shlq $63, %rcx +; AVX1-X64-NEXT: vcmpltss %xmm1, %xmm0, %xmm2 +; AVX1-X64-NEXT: vandnps %xmm1, %xmm2, %xmm1 +; AVX1-X64-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX1-X64-NEXT: vcvttss2si %xmm0, %rax +; AVX1-X64-NEXT: xorq %rcx, %rax +; AVX1-X64-NEXT: retq +; +; AVX512-X86-LABEL: fptoui_f32toi64: +; AVX512-X86: # %bb.0: +; AVX512-X86-NEXT: pushl %ebp +; AVX512-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX512-X86-NEXT: .cfi_offset %ebp, -8 +; AVX512-X86-NEXT: movl %esp, %ebp +; AVX512-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX512-X86-NEXT: andl $-8, %esp +; AVX512-X86-NEXT: subl $8, %esp +; AVX512-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512-X86-NEXT: vcmpltss %xmm1, %xmm0, %k1 +; AVX512-X86-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512-X86-NEXT: xorl %edx, %edx +; AVX512-X86-NEXT: vucomiss %xmm0, %xmm1 +; AVX512-X86-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-X86-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512-X86-NEXT: vmovss %xmm0, (%esp) +; AVX512-X86-NEXT: flds (%esp) +; AVX512-X86-NEXT: fisttpll (%esp) +; AVX512-X86-NEXT: setbe %dl +; AVX512-X86-NEXT: shll $31, %edx +; AVX512-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512-X86-NEXT: movl (%esp), %eax +; AVX512-X86-NEXT: movl %ebp, %esp +; AVX512-X86-NEXT: popl %ebp +; AVX512-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX512-X86-NEXT: retl +; +; AVX512-X64-LABEL: fptoui_f32toi64: +; AVX512-X64: # %bb.0: +; AVX512-X64-NEXT: vcvttss2usi %xmm0, %rax +; AVX512-X64-NEXT: retq +; +; CHECK-LABEL: fptoui_f32toi64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset %ebp, -8 +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: .cfi_def_cfa_register %ebp +; CHECK-NEXT: andl $-8, %esp +; 
CHECK-NEXT: subl $16, %esp +; CHECK-NEXT: flds 8(%ebp) +; CHECK-NEXT: flds {{\.LCPI.*}} +; CHECK-NEXT: fucom %st(1) +; CHECK-NEXT: fnstsw %ax +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: # kill: def $ah killed $ah killed $ax +; CHECK-NEXT: sahf +; CHECK-NEXT: setbe %al +; CHECK-NEXT: fldz +; CHECK-NEXT: ja .LBB9_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: fstp %st(0) +; CHECK-NEXT: fldz +; CHECK-NEXT: fxch %st(1) +; CHECK-NEXT: .LBB9_2: +; CHECK-NEXT: fstp %st(1) +; CHECK-NEXT: fsubrp %st, %st(1) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: orl $3072, %ecx # imm = 0xC00 +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistpll {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movb %al, %dl +; CHECK-NEXT: shll $31, %edx +; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: .cfi_def_cfa %esp, 4 +; CHECK-NEXT: retl + %result = call i64 @llvm.experimental.constrained.fptoui.i64.f32(float %x, + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +define i8 @fptosi_f64toi8(double %x) #0 { +; SSE-X86-LABEL: fptosi_f64toi8: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: cvttsd2si {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: # kill: def $al killed $al killed $eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptosi_f64toi8: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttsd2si %xmm0, %eax +; SSE-X64-NEXT: # kill: def $al killed $al killed $eax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptosi_f64toi8: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: vcvttsd2si {{[0-9]+}}(%esp), %eax +; AVX-X86-NEXT: # kill: def $al killed $al killed $eax +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptosi_f64toi8: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vcvttsd2si %xmm0, %eax +; AVX-X64-NEXT: # kill: def $al killed $al killed $eax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptosi_f64toi8: +; CHECK: # %bb.0: +; 
CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistps {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %result = call i8 @llvm.experimental.constrained.fptosi.i8.f64(double %x, + metadata !"fpexcept.strict") #0 + ret i8 %result +} + +define i16 @fptosi_f64toi16(double %x) #0 { +; SSE-X86-LABEL: fptosi_f64toi16: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: cvttsd2si {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptosi_f64toi16: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttsd2si %xmm0, %eax +; SSE-X64-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptosi_f64toi16: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: vcvttsd2si {{[0-9]+}}(%esp), %eax +; AVX-X86-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptosi_f64toi16: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vcvttsd2si %xmm0, %eax +; AVX-X64-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptosi_f64toi16: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistps {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + 
%result = call i16 @llvm.experimental.constrained.fptosi.i16.f64(double %x, + metadata !"fpexcept.strict") #0 + ret i16 %result +} + +define i32 @fptosi_f64toi32(double %x) #0 { +; SSE-X86-LABEL: fptosi_f64toi32: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: cvttsd2si {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptosi_f64toi32: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttsd2si %xmm0, %eax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptosi_f64toi32: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: vcvttsd2si {{[0-9]+}}(%esp), %eax +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptosi_f64toi32: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vcvttsd2si %xmm0, %eax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptosi_f64toi32: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fnstcw (%esp) +; CHECK-NEXT: movzwl (%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw (%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %result = call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %x, + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +define i64 @fptosi_f64toi64(double %x) #0 { +; SSE-X86-LABEL: fptosi_f64toi64: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: pushl %ebp +; SSE-X86-NEXT: .cfi_def_cfa_offset 8 +; SSE-X86-NEXT: .cfi_offset %ebp, -8 +; SSE-X86-NEXT: movl %esp, %ebp +; SSE-X86-NEXT: .cfi_def_cfa_register %ebp +; SSE-X86-NEXT: andl $-8, %esp +; SSE-X86-NEXT: subl $16, %esp +; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-X86-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fldl {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-X86-NEXT: movw 
%ax, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-X86-NEXT: movl %ebp, %esp +; SSE-X86-NEXT: popl %ebp +; SSE-X86-NEXT: .cfi_def_cfa %esp, 4 +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptosi_f64toi64: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttsd2si %xmm0, %rax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptosi_f64toi64: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: pushl %ebp +; AVX-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX-X86-NEXT: .cfi_offset %ebp, -8 +; AVX-X86-NEXT: movl %esp, %ebp +; AVX-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX-X86-NEXT: andl $-8, %esp +; AVX-X86-NEXT: subl $8, %esp +; AVX-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-X86-NEXT: vmovsd %xmm0, (%esp) +; AVX-X86-NEXT: fldl (%esp) +; AVX-X86-NEXT: fisttpll (%esp) +; AVX-X86-NEXT: movl (%esp), %eax +; AVX-X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; AVX-X86-NEXT: movl %ebp, %esp +; AVX-X86-NEXT: popl %ebp +; AVX-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptosi_f64toi64: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vcvttsd2si %xmm0, %rax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptosi_f64toi64: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset %ebp, -8 +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: .cfi_def_cfa_register %ebp +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $16, %esp +; CHECK-NEXT: fldl 8(%ebp) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistpll {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp +; 
CHECK-NEXT: .cfi_def_cfa %esp, 4 +; CHECK-NEXT: retl + %result = call i64 @llvm.experimental.constrained.fptosi.i64.f64(double %x, + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +define i1 @fptoui_f64toi1(double %x) #0 { +; SSE-X86-LABEL: fptoui_f64toi1: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: cvttsd2si {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: # kill: def $al killed $al killed $eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptoui_f64toi1: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttsd2si %xmm0, %eax +; SSE-X64-NEXT: # kill: def $al killed $al killed $eax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptoui_f64toi1: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: vcvttsd2si {{[0-9]+}}(%esp), %eax +; AVX-X86-NEXT: # kill: def $al killed $al killed $eax +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptoui_f64toi1: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vcvttsd2si %xmm0, %eax +; AVX-X64-NEXT: # kill: def $al killed $al killed $eax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptoui_f64toi1: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistps {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %result = call i1 @llvm.experimental.constrained.fptoui.i1.f64(double %x, + metadata !"fpexcept.strict") #0 + ret i1 %result +} + +define i8 @fptoui_f64toi8(double %x) #0 { +; SSE-X86-LABEL: fptoui_f64toi8: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: cvttsd2si {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: # kill: def $al killed $al killed $eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptoui_f64toi8: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttsd2si %xmm0, %eax +; SSE-X64-NEXT: # 
kill: def $al killed $al killed $eax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptoui_f64toi8: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: vcvttsd2si {{[0-9]+}}(%esp), %eax +; AVX-X86-NEXT: # kill: def $al killed $al killed $eax +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptoui_f64toi8: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vcvttsd2si %xmm0, %eax +; AVX-X64-NEXT: # kill: def $al killed $al killed $eax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptoui_f64toi8: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistps {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %al +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %result = call i8 @llvm.experimental.constrained.fptoui.i8.f64(double %x, + metadata !"fpexcept.strict") #0 + ret i8 %result +} + +define i16 @fptoui_f64toi16(double %x) #0 { +; SSE-X86-LABEL: fptoui_f64toi16: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: cvttsd2si {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptoui_f64toi16: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttsd2si %xmm0, %eax +; SSE-X64-NEXT: # kill: def $ax killed $ax killed $eax +; SSE-X64-NEXT: retq +; +; AVX-X86-LABEL: fptoui_f64toi16: +; AVX-X86: # %bb.0: +; AVX-X86-NEXT: vcvttsd2si {{[0-9]+}}(%esp), %eax +; AVX-X86-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-X86-NEXT: retl +; +; AVX-X64-LABEL: fptoui_f64toi16: +; AVX-X64: # %bb.0: +; AVX-X64-NEXT: vcvttsd2si %xmm0, %eax +; AVX-X64-NEXT: # kill: def $ax killed $ax killed $eax +; AVX-X64-NEXT: retq +; +; CHECK-LABEL: fptoui_f64toi16: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: 
.cfi_def_cfa_offset 12 +; CHECK-NEXT: fldl {{[0-9]+}}(%esp) +; CHECK-NEXT: fnstcw (%esp) +; CHECK-NEXT: movzwl (%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw (%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: retl + %result = call i16 @llvm.experimental.constrained.fptoui.i16.f64(double %x, + metadata !"fpexcept.strict") #0 + ret i16 %result +} + +define i32 @fptoui_f64toi32(double %x) #0 { +; SSE-X86-LABEL: fptoui_f64toi32: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-X86-NEXT: xorl %ecx, %ecx +; SSE-X86-NEXT: ucomisd %xmm0, %xmm1 +; SSE-X86-NEXT: setbe %cl +; SSE-X86-NEXT: shll $31, %ecx +; SSE-X86-NEXT: movapd %xmm0, %xmm2 +; SSE-X86-NEXT: cmpltsd %xmm1, %xmm2 +; SSE-X86-NEXT: andnpd %xmm1, %xmm2 +; SSE-X86-NEXT: subsd %xmm2, %xmm0 +; SSE-X86-NEXT: cvttsd2si %xmm0, %eax +; SSE-X86-NEXT: xorl %ecx, %eax +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptoui_f64toi32: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: cvttsd2si %xmm0, %rax +; SSE-X64-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-X64-NEXT: retq +; +; AVX1-X86-LABEL: fptoui_f64toi32: +; AVX1-X86: # %bb.0: +; AVX1-X86-NEXT: pushl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX1-X86-NEXT: .cfi_offset %ebp, -8 +; AVX1-X86-NEXT: movl %esp, %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX1-X86-NEXT: andl $-8, %esp +; AVX1-X86-NEXT: subl $8, %esp +; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-X86-NEXT: vmovsd %xmm0, (%esp) +; AVX1-X86-NEXT: fldl (%esp) +; AVX1-X86-NEXT: fisttpll (%esp) +; AVX1-X86-NEXT: movl (%esp), %eax +; AVX1-X86-NEXT: movl %ebp, %esp +; AVX1-X86-NEXT: popl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa %esp, 4 +; 
AVX1-X86-NEXT: retl +; +; AVX1-X64-LABEL: fptoui_f64toi32: +; AVX1-X64: # %bb.0: +; AVX1-X64-NEXT: vcvttsd2si %xmm0, %rax +; AVX1-X64-NEXT: # kill: def $eax killed $eax killed $rax +; AVX1-X64-NEXT: retq +; +; AVX512-X86-LABEL: fptoui_f64toi32: +; AVX512-X86: # %bb.0: +; AVX512-X86-NEXT: vcvttsd2usi {{[0-9]+}}(%esp), %eax +; AVX512-X86-NEXT: retl +; +; AVX512-X64-LABEL: fptoui_f64toi32: +; AVX512-X64: # %bb.0: +; AVX512-X64-NEXT: vcvttsd2usi %xmm0, %eax +; AVX512-X64-NEXT: retq +; +; CHECK-LABEL: fptoui_f64toi32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset %ebp, -8 +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: .cfi_def_cfa_register %ebp +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $16, %esp +; CHECK-NEXT: fldl 8(%ebp) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: orl $3072, %eax # imm = 0xC00 +; CHECK-NEXT: movw %ax, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistpll {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: .cfi_def_cfa %esp, 4 +; CHECK-NEXT: retl + %result = call i32 @llvm.experimental.constrained.fptoui.i32.f64(double %x, + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +define i64 @fptoui_f64toi64(double %x) #0 { +; SSE-X86-LABEL: fptoui_f64toi64: +; SSE-X86: # %bb.0: +; SSE-X86-NEXT: pushl %ebp +; SSE-X86-NEXT: .cfi_def_cfa_offset 8 +; SSE-X86-NEXT: .cfi_offset %ebp, -8 +; SSE-X86-NEXT: movl %esp, %ebp +; SSE-X86-NEXT: .cfi_def_cfa_register %ebp +; SSE-X86-NEXT: andl $-8, %esp +; SSE-X86-NEXT: subl $16, %esp +; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-X86-NEXT: movapd %xmm0, %xmm2 +; SSE-X86-NEXT: cmpltsd %xmm1, %xmm2 +; SSE-X86-NEXT: andnpd %xmm1, %xmm2 +; SSE-X86-NEXT: movapd %xmm0, %xmm3 +; SSE-X86-NEXT: subsd 
%xmm2, %xmm3 +; SSE-X86-NEXT: movsd %xmm3, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fldl {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-X86-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-X86-NEXT: xorl %edx, %edx +; SSE-X86-NEXT: ucomisd %xmm0, %xmm1 +; SSE-X86-NEXT: setbe %dl +; SSE-X86-NEXT: shll $31, %edx +; SSE-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; SSE-X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-X86-NEXT: movl %ebp, %esp +; SSE-X86-NEXT: popl %ebp +; SSE-X86-NEXT: .cfi_def_cfa %esp, 4 +; SSE-X86-NEXT: retl +; +; SSE-X64-LABEL: fptoui_f64toi64: +; SSE-X64: # %bb.0: +; SSE-X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-X64-NEXT: xorl %ecx, %ecx +; SSE-X64-NEXT: ucomisd %xmm1, %xmm0 +; SSE-X64-NEXT: setae %cl +; SSE-X64-NEXT: shlq $63, %rcx +; SSE-X64-NEXT: movapd %xmm0, %xmm2 +; SSE-X64-NEXT: cmpltsd %xmm1, %xmm2 +; SSE-X64-NEXT: andnpd %xmm1, %xmm2 +; SSE-X64-NEXT: subsd %xmm2, %xmm0 +; SSE-X64-NEXT: cvttsd2si %xmm0, %rax +; SSE-X64-NEXT: xorq %rcx, %rax +; SSE-X64-NEXT: retq +; +; AVX1-X86-LABEL: fptoui_f64toi64: +; AVX1-X86: # %bb.0: +; AVX1-X86-NEXT: pushl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX1-X86-NEXT: .cfi_offset %ebp, -8 +; AVX1-X86-NEXT: movl %esp, %ebp +; AVX1-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX1-X86-NEXT: andl $-8, %esp +; AVX1-X86-NEXT: subl $8, %esp +; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-X86-NEXT: vcmpltsd %xmm1, %xmm0, %xmm2 +; AVX1-X86-NEXT: vandnpd %xmm1, %xmm2, %xmm2 +; AVX1-X86-NEXT: vsubsd %xmm2, %xmm0, %xmm2 +; AVX1-X86-NEXT: vmovsd %xmm2, (%esp) +; AVX1-X86-NEXT: fldl (%esp) +; AVX1-X86-NEXT: fisttpll (%esp) +; AVX1-X86-NEXT: xorl %edx, %edx +; AVX1-X86-NEXT: vucomisd %xmm0, %xmm1 +; AVX1-X86-NEXT: setbe %dl +; 
AVX1-X86-NEXT: shll $31, %edx +; AVX1-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX1-X86-NEXT: movl (%esp), %eax +; AVX1-X86-NEXT: movl %ebp, %esp +; AVX1-X86-NEXT: popl %ebp +; AVX1-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX1-X86-NEXT: retl +; +; AVX1-X64-LABEL: fptoui_f64toi64: +; AVX1-X64: # %bb.0: +; AVX1-X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-X64-NEXT: xorl %ecx, %ecx +; AVX1-X64-NEXT: vucomisd %xmm1, %xmm0 +; AVX1-X64-NEXT: setae %cl +; AVX1-X64-NEXT: shlq $63, %rcx +; AVX1-X64-NEXT: vcmpltsd %xmm1, %xmm0, %xmm2 +; AVX1-X64-NEXT: vandnpd %xmm1, %xmm2, %xmm1 +; AVX1-X64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX1-X64-NEXT: vcvttsd2si %xmm0, %rax +; AVX1-X64-NEXT: xorq %rcx, %rax +; AVX1-X64-NEXT: retq +; +; AVX512-X86-LABEL: fptoui_f64toi64: +; AVX512-X86: # %bb.0: +; AVX512-X86-NEXT: pushl %ebp +; AVX512-X86-NEXT: .cfi_def_cfa_offset 8 +; AVX512-X86-NEXT: .cfi_offset %ebp, -8 +; AVX512-X86-NEXT: movl %esp, %ebp +; AVX512-X86-NEXT: .cfi_def_cfa_register %ebp +; AVX512-X86-NEXT: andl $-8, %esp +; AVX512-X86-NEXT: subl $8, %esp +; AVX512-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512-X86-NEXT: vcmpltsd %xmm1, %xmm0, %k1 +; AVX512-X86-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX512-X86-NEXT: xorl %edx, %edx +; AVX512-X86-NEXT: vucomisd %xmm0, %xmm1 +; AVX512-X86-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} +; AVX512-X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX512-X86-NEXT: vmovsd %xmm0, (%esp) +; AVX512-X86-NEXT: fldl (%esp) +; AVX512-X86-NEXT: fisttpll (%esp) +; AVX512-X86-NEXT: setbe %dl +; AVX512-X86-NEXT: shll $31, %edx +; AVX512-X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512-X86-NEXT: movl (%esp), %eax +; AVX512-X86-NEXT: movl %ebp, %esp +; AVX512-X86-NEXT: popl %ebp +; AVX512-X86-NEXT: .cfi_def_cfa %esp, 4 +; AVX512-X86-NEXT: retl +; +; AVX512-X64-LABEL: fptoui_f64toi64: +; AVX512-X64: # %bb.0: +; AVX512-X64-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512-X64-NEXT: retq +; +; CHECK-LABEL: fptoui_f64toi64: +; 
CHECK: # %bb.0: +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset %ebp, -8 +; CHECK-NEXT: movl %esp, %ebp +; CHECK-NEXT: .cfi_def_cfa_register %ebp +; CHECK-NEXT: andl $-8, %esp +; CHECK-NEXT: subl $16, %esp +; CHECK-NEXT: fldl 8(%ebp) +; CHECK-NEXT: flds {{\.LCPI.*}} +; CHECK-NEXT: fucom %st(1) +; CHECK-NEXT: fnstsw %ax +; CHECK-NEXT: xorl %edx, %edx +; CHECK-NEXT: # kill: def $ah killed $ah killed $ax +; CHECK-NEXT: sahf +; CHECK-NEXT: setbe %al +; CHECK-NEXT: fldz +; CHECK-NEXT: ja .LBB18_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: fstp %st(0) +; CHECK-NEXT: fldz +; CHECK-NEXT: fxch %st(1) +; CHECK-NEXT: .LBB18_2: +; CHECK-NEXT: fstp %st(1) +; CHECK-NEXT: fsubrp %st, %st(1) +; CHECK-NEXT: fnstcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: orl $3072, %ecx # imm = 0xC00 +; CHECK-NEXT: movw %cx, {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: fistpll {{[0-9]+}}(%esp) +; CHECK-NEXT: fldcw {{[0-9]+}}(%esp) +; CHECK-NEXT: movb %al, %dl +; CHECK-NEXT: shll $31, %edx +; CHECK-NEXT: xorl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %ebp, %esp +; CHECK-NEXT: popl %ebp +; CHECK-NEXT: .cfi_def_cfa %esp, 4 +; CHECK-NEXT: retl + %result = call i64 @llvm.experimental.constrained.fptoui.i64.f64(double %x, + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll index e4fcf54e6950a..51ffc1c48eee0 100644 --- a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll +++ b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll @@ -11,6 +11,16 @@ declare x86_fp80 @llvm.experimental.constrained.fpext.x86_fp80.f64(double, metad declare x86_fp80 @llvm.experimental.constrained.sqrt.x86_fp80(x86_fp80, metadata, metadata) declare float @llvm.experimental.constrained.fptrunc.f32.x86_fp80(x86_fp80, metadata, metadata) declare double 
@llvm.experimental.constrained.fptrunc.f64.x86_fp80(x86_fp80, metadata, metadata) +declare i1 @llvm.experimental.constrained.fptosi.i1.x86_fp80(x86_fp80, metadata) +declare i8 @llvm.experimental.constrained.fptosi.i8.x86_fp80(x86_fp80, metadata) +declare i16 @llvm.experimental.constrained.fptosi.i16.x86_fp80(x86_fp80, metadata) +declare i32 @llvm.experimental.constrained.fptosi.i32.x86_fp80(x86_fp80, metadata) +declare i64 @llvm.experimental.constrained.fptosi.i64.x86_fp80(x86_fp80, metadata) +declare i1 @llvm.experimental.constrained.fptoui.i1.x86_fp80(x86_fp80, metadata) +declare i8 @llvm.experimental.constrained.fptoui.i8.x86_fp80(x86_fp80, metadata) +declare i16 @llvm.experimental.constrained.fptoui.i16.x86_fp80(x86_fp80, metadata) +declare i32 @llvm.experimental.constrained.fptoui.i32.x86_fp80(x86_fp80, metadata) +declare i64 @llvm.experimental.constrained.fptoui.i64.x86_fp80(x86_fp80, metadata) define x86_fp80 @fadd_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp { ; X86-LABEL: fadd_fp80: @@ -190,4 +200,405 @@ define x86_fp80 @fsqrt_fp80(x86_fp80 %a) nounwind strictfp { ret x86_fp80 %ret } +define i1 @fp80_to_sint1(x86_fp80 %x) #0 { +; X86-LABEL: fp80_to_sint1: +; X86: # %bb.0: +; X86-NEXT: subl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: fistps {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: addl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-LABEL: fp80_to_sint1: +; X64: # %bb.0: +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $3072, %eax # imm = 0xC00 +; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: 
fistps -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movb -{{[0-9]+}}(%rsp), %al +; X64-NEXT: retq + %result = call i1 @llvm.experimental.constrained.fptosi.i1.x86_fp80(x86_fp80 %x, + metadata !"fpexcept.strict") #0 + ret i1 %result +} + +define i8 @fp80_to_sint8(x86_fp80 %x) #0 { +; X86-LABEL: fp80_to_sint8: +; X86: # %bb.0: +; X86-NEXT: subl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: fistps {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: addl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-LABEL: fp80_to_sint8: +; X64: # %bb.0: +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $3072, %eax # imm = 0xC00 +; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: fistps -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movb -{{[0-9]+}}(%rsp), %al +; X64-NEXT: retq + %result = call i8 @llvm.experimental.constrained.fptosi.i8.x86_fp80(x86_fp80 %x, + metadata !"fpexcept.strict") #0 + ret i8 %result +} + +define i16 @fp80_to_sint16(x86_fp80 %x) #0 { +; X86-LABEL: fp80_to_sint16: +; X86: # %bb.0: +; X86-NEXT: subl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: fistps {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; 
X64-LABEL: fp80_to_sint16: +; X64: # %bb.0: +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $3072, %eax # imm = 0xC00 +; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: fistps -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: retq + %result = call i16 @llvm.experimental.constrained.fptosi.i16.x86_fp80(x86_fp80 %x, + metadata !"fpexcept.strict") #0 + ret i16 %result +} + +define i32 @fp80_to_sint32(x86_fp80 %x) #0 { +; X86-LABEL: fp80_to_sint32: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fnstcw (%esp) +; X86-NEXT: movzwl (%esp), %eax +; X86-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: fistpl {{[0-9]+}}(%esp) +; X86-NEXT: fldcw (%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-LABEL: fp80_to_sint32: +; X64: # %bb.0: # %entry +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $3072, %eax # imm = 0xC00 +; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: fistpl -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: retq +entry: + %result = call i32 @llvm.experimental.constrained.fptosi.i32.x86_fp80(x86_fp80 %x, + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +define i64 @fp80_to_sint64(x86_fp80 %x) #0 { +; X86-LABEL: fp80_to_sint64: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; 
X86-NEXT: subl $16, %esp +; X86-NEXT: fldt 8(%ebp) +; X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: fistpll {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl +; +; X64-LABEL: fp80_to_sint64: +; X64: # %bb.0: +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $3072, %eax # imm = 0xC00 +; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: fistpll -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: retq + %result = call i64 @llvm.experimental.constrained.fptosi.i64.x86_fp80(x86_fp80 %x, + metadata !"fpexcept.strict") #0 + ret i64 %result +} + +define i1 @fp80_to_uint1(x86_fp80 %x) #0 { +; X86-LABEL: fp80_to_uint1: +; X86: # %bb.0: +; X86-NEXT: subl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: fistps {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: addl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-LABEL: fp80_to_uint1: +; X64: # %bb.0: +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $3072, %eax # imm = 0xC00 +; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: fistps -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw 
-{{[0-9]+}}(%rsp) +; X64-NEXT: movb -{{[0-9]+}}(%rsp), %al +; X64-NEXT: retq + %result = call i1 @llvm.experimental.constrained.fptoui.i1.x86_fp80(x86_fp80 %x, + metadata !"fpexcept.strict") #0 + ret i1 %result +} + +define i8 @fp80_to_uint8(x86_fp80 %x) #0 { +; X86-LABEL: fp80_to_uint8: +; X86: # %bb.0: +; X86-NEXT: subl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: fistps {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: movb {{[0-9]+}}(%esp), %al +; X86-NEXT: addl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-LABEL: fp80_to_uint8: +; X64: # %bb.0: +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $3072, %eax # imm = 0xC00 +; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: fistps -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movb -{{[0-9]+}}(%rsp), %al +; X64-NEXT: retq + %result = call i8 @llvm.experimental.constrained.fptoui.i8.x86_fp80(x86_fp80 %x, + metadata !"fpexcept.strict") #0 + ret i8 %result +} + +define i16 @fp80_to_uint16(x86_fp80 %x) #0 { +; X86-LABEL: fp80_to_uint16: +; X86: # %bb.0: +; X86-NEXT: subl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 12 +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fnstcw (%esp) +; X86-NEXT: movzwl (%esp), %eax +; X86-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: fistpl {{[0-9]+}}(%esp) +; X86-NEXT: fldcw (%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: addl $8, %esp +; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: retl +; +; X64-LABEL: fp80_to_uint16: +; 
X64: # %bb.0: +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $3072, %eax # imm = 0xC00 +; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: fistpl -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %result = call i16 @llvm.experimental.constrained.fptoui.i16.x86_fp80(x86_fp80 %x, + metadata !"fpexcept.strict") #0 + ret i16 %result +} + +define i32 @fp80_to_uint32(x86_fp80 %x) #0 { +; X86-LABEL: fp80_to_uint32: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: fldt 8(%ebp) +; X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $3072, %eax # imm = 0xC00 +; X86-NEXT: movw %ax, {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: fistpll {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl +; +; X64-LABEL: fp80_to_uint32: +; X64: # %bb.0: +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: orl $3072, %eax # imm = 0xC00 +; X64-NEXT: movw %ax, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: fistpll -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movl -{{[0-9]+}}(%rsp), %eax +; X64-NEXT: retq + %result = call i32 @llvm.experimental.constrained.fptoui.i32.x86_fp80(x86_fp80 %x, + metadata !"fpexcept.strict") #0 + ret i32 %result +} + +define i64 @fp80_to_uint64(x86_fp80 %x) #0 { +; X86-LABEL: fp80_to_uint64: +; X86: # %bb.0: +; 
X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-8, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: fldt 8(%ebp) +; X86-NEXT: flds {{\.LCPI.*}} +; X86-NEXT: fucom %st(1) +; X86-NEXT: fnstsw %ax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: # kill: def $ah killed $ah killed $ax +; X86-NEXT: sahf +; X86-NEXT: setbe %al +; X86-NEXT: fldz +; X86-NEXT: ja .LBB18_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: fstp %st(0) +; X86-NEXT: fldz +; X86-NEXT: fxch %st(1) +; X86-NEXT: .LBB18_2: +; X86-NEXT: fstp %st(1) +; X86-NEXT: fsubrp %st, %st(1) +; X86-NEXT: fnstcw {{[0-9]+}}(%esp) +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl $3072, %ecx # imm = 0xC00 +; X86-NEXT: movw %cx, {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: fistpll {{[0-9]+}}(%esp) +; X86-NEXT: fldcw {{[0-9]+}}(%esp) +; X86-NEXT: movb %al, %dl +; X86-NEXT: shll $31, %edx +; X86-NEXT: xorl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 +; X86-NEXT: retl +; +; X64-LABEL: fp80_to_uint64: +; X64: # %bb.0: +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: flds {{.*}}(%rip) +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: fucomi %st(1), %st +; X64-NEXT: setbe %al +; X64-NEXT: fldz +; X64-NEXT: fxch %st(1) +; X64-NEXT: fcmovnbe %st(1), %st +; X64-NEXT: fstp %st(1) +; X64-NEXT: fsubrp %st, %st(1) +; X64-NEXT: fnstcw -{{[0-9]+}}(%rsp) +; X64-NEXT: movzwl -{{[0-9]+}}(%rsp), %ecx +; X64-NEXT: orl $3072, %ecx # imm = 0xC00 +; X64-NEXT: movw %cx, -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: fistpll -{{[0-9]+}}(%rsp) +; X64-NEXT: fldcw -{{[0-9]+}}(%rsp) +; X64-NEXT: shlq $63, %rax +; X64-NEXT: xorq -{{[0-9]+}}(%rsp), %rax +; X64-NEXT: retq + %result = call i64 @llvm.experimental.constrained.fptoui.i64.x86_fp80(x86_fp80 %x, + metadata !"fpexcept.strict") #0 + ret i64 
%result +} + attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll new file mode 100644 index 0000000000000..fdefd937e7eda --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -0,0 +1,2397 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,SSE-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,SSE-64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX,AVX-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX,AVX-64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f,avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VL-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VL-64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512f,avx512dq -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX512DQ,AVX512DQ-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,avx512dq -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX512DQ,AVX512DQ-64 + +declare <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f64(<2 x double>, metadata) +declare <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f64(<2 x double>, metadata) +declare <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f32(<2 x float>, metadata) +declare <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f32(<2 x float>, metadata) +declare <2 x i32> 
@llvm.experimental.constrained.fptosi.v2i32.v2f64(<2 x double>, metadata) +declare <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f64(<2 x double>, metadata) +declare <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f32(<2 x float>, metadata) +declare <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f32(<2 x float>, metadata) +declare <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f64(<2 x double>, metadata) +declare <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f64(<2 x double>, metadata) +declare <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f32(<2 x float>, metadata) +declare <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f32(<2 x float>, metadata) +declare <2 x i8> @llvm.experimental.constrained.fptosi.v2i8.v2f64(<2 x double>, metadata) +declare <2 x i8> @llvm.experimental.constrained.fptoui.v2i8.v2f64(<2 x double>, metadata) +declare <2 x i8> @llvm.experimental.constrained.fptosi.v2i8.v2f32(<2 x float>, metadata) +declare <2 x i8> @llvm.experimental.constrained.fptoui.v2i8.v2f32(<2 x float>, metadata) +declare <2 x i1> @llvm.experimental.constrained.fptosi.v2i1.v2f64(<2 x double>, metadata) +declare <2 x i1> @llvm.experimental.constrained.fptoui.v2i1.v2f64(<2 x double>, metadata) +declare <2 x i1> @llvm.experimental.constrained.fptosi.v2i1.v2f32(<2 x float>, metadata) +declare <2 x i1> @llvm.experimental.constrained.fptoui.v2i1.v2f32(<2 x float>, metadata) +declare <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f32(<4 x float>, metadata) +declare <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f32(<4 x float>, metadata) +declare <4 x i16> @llvm.experimental.constrained.fptosi.v4i16.v4f32(<4 x float>, metadata) +declare <4 x i16> @llvm.experimental.constrained.fptoui.v4i16.v4f32(<4 x float>, metadata) +declare <4 x i8> @llvm.experimental.constrained.fptosi.v4i8.v4f32(<4 x float>, metadata) +declare <4 x i8> @llvm.experimental.constrained.fptoui.v4i8.v4f32(<4 x float>, metadata) 
+declare <4 x i1> @llvm.experimental.constrained.fptosi.v4i1.v4f32(<4 x float>, metadata) +declare <4 x i1> @llvm.experimental.constrained.fptoui.v4i1.v4f32(<4 x float>, metadata) + +define <2 x i64> @strict_vector_fptosi_v2f64_to_v2i64(<2 x double> %a) #0 { +; SSE-32-LABEL: strict_vector_fptosi_v2f64_to_v2i64: +; SSE-32: # %bb.0: +; SSE-32-NEXT: pushl %ebp +; SSE-32-NEXT: .cfi_def_cfa_offset 8 +; SSE-32-NEXT: .cfi_offset %ebp, -8 +; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: .cfi_def_cfa_register %ebp +; SSE-32-NEXT: andl $-8, %esp +; SSE-32-NEXT: subl $24, %esp +; SSE-32-NEXT: movhps %xmm0, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldl {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldl {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw (%esp) +; SSE-32-NEXT: movzwl (%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw (%esp) +; SSE-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: popl %ebp +; SSE-32-NEXT: .cfi_def_cfa %esp, 4 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptosi_v2f64_to_v2i64: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: movq %rax, %xmm1 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: movq %rax, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: 
strict_vector_fptosi_v2f64_to_v2i64: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $16, %esp +; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovhps %xmm0, (%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptosi_v2f64_to_v2i64: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX-64-NEXT: vmovq %rax, %xmm1 +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX-64-NEXT: vmovq %rax, %xmm0 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-64-NEXT: retq +; +; AVX512VL-32-LABEL: strict_vector_fptosi_v2f64_to_v2i64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $16, %esp +; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovhps %xmm0, (%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $2, 
(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; +; AVX512VL-64-LABEL: strict_vector_fptosi_v2f64_to_v2i64: +; AVX512VL-64: # %bb.0: +; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm1 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm0 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-64-NEXT: retq +; +; FIXME: This is an unsafe behavior for strict FP +; AVX512DQ-LABEL: strict_vector_fptosi_v2f64_to_v2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f64(<2 x double> %a, + metadata !"fpexcept.strict") + ret <2 x i64> %ret +} + +define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 { +; SSE-32-LABEL: strict_vector_fptoui_v2f64_to_v2i64: +; SSE-32: # %bb.0: +; SSE-32-NEXT: pushl %ebp +; SSE-32-NEXT: .cfi_def_cfa_offset 8 +; SSE-32-NEXT: .cfi_offset %ebp, -8 +; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: .cfi_def_cfa_register %ebp +; SSE-32-NEXT: andl $-8, %esp +; SSE-32-NEXT: subl $24, %esp +; SSE-32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-32-NEXT: ucomisd %xmm2, %xmm0 +; SSE-32-NEXT: xorpd %xmm1, %xmm1 +; SSE-32-NEXT: xorpd %xmm3, %xmm3 +; SSE-32-NEXT: jb .LBB1_2 +; SSE-32-NEXT: # %bb.1: +; SSE-32-NEXT: movapd %xmm2, %xmm3 +; SSE-32-NEXT: .LBB1_2: +; SSE-32-NEXT: movapd %xmm0, %xmm4 +; SSE-32-NEXT: subsd %xmm3, %xmm4 +; SSE-32-NEXT: movsd %xmm4, {{[0-9]+}}(%esp) +; SSE-32-NEXT: setae %al +; SSE-32-NEXT: fldl {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw 
{{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SSE-32-NEXT: orl $3072, %ecx # imm = 0xC00 +; SSE-32-NEXT: movw %cx, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm0 +; SSE-32-NEXT: jb .LBB1_4 +; SSE-32-NEXT: # %bb.3: +; SSE-32-NEXT: movapd %xmm2, %xmm1 +; SSE-32-NEXT: .LBB1_4: +; SSE-32-NEXT: subsd %xmm1, %xmm0 +; SSE-32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; SSE-32-NEXT: setae %cl +; SSE-32-NEXT: fldl {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw (%esp) +; SSE-32-NEXT: movzwl (%esp), %edx +; SSE-32-NEXT: orl $3072, %edx # imm = 0xC00 +; SSE-32-NEXT: movw %dx, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw (%esp) +; SSE-32-NEXT: movzbl %al, %eax +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: movd %eax, %xmm1 +; SSE-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-32-NEXT: movzbl %cl, %eax +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: movd %eax, %xmm1 +; SSE-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: popl %ebp +; SSE-32-NEXT: .cfi_def_cfa %esp, 4 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i64: +; SSE-64: # %bb.0: +; SSE-64-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-64-NEXT: xorl %eax, %eax +; SSE-64-NEXT: ucomisd %xmm2, %xmm0 +; SSE-64-NEXT: setae %al +; SSE-64-NEXT: shlq $63, %rax +; SSE-64-NEXT: movapd %xmm0, %xmm1 +; SSE-64-NEXT: cmpltsd %xmm2, %xmm1 +; SSE-64-NEXT: andnpd %xmm2, %xmm1 +; SSE-64-NEXT: movapd %xmm0, 
%xmm3 +; SSE-64-NEXT: subsd %xmm1, %xmm3 +; SSE-64-NEXT: cvttsd2si %xmm3, %rcx +; SSE-64-NEXT: xorq %rax, %rcx +; SSE-64-NEXT: movq %rcx, %xmm1 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-64-NEXT: xorl %eax, %eax +; SSE-64-NEXT: ucomisd %xmm2, %xmm0 +; SSE-64-NEXT: setae %al +; SSE-64-NEXT: shlq $63, %rax +; SSE-64-NEXT: movapd %xmm0, %xmm3 +; SSE-64-NEXT: cmpltsd %xmm2, %xmm3 +; SSE-64-NEXT: andnpd %xmm2, %xmm3 +; SSE-64-NEXT: subsd %xmm3, %xmm0 +; SSE-64-NEXT: cvttsd2si %xmm0, %rcx +; SSE-64-NEXT: xorq %rax, %rcx +; SSE-64-NEXT: movq %rcx, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: strict_vector_fptoui_v2f64_to_v2i64: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $16, %esp +; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: jb .LBB1_2 +; AVX-32-NEXT: # %bb.1: +; AVX-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX-32-NEXT: .LBB1_2: +; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX-32-NEXT: vmovsd %xmm3, (%esp) +; AVX-32-NEXT: fldl (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: setae %al +; AVX-32-NEXT: movzbl %al, %eax +; AVX-32-NEXT: shll $31, %eax +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vucomisd %xmm1, %xmm0 +; AVX-32-NEXT: jb .LBB1_4 +; AVX-32-NEXT: # %bb.3: +; AVX-32-NEXT: vmovapd %xmm1, %xmm2 +; AVX-32-NEXT: .LBB1_4: +; AVX-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: setae %cl +; AVX-32-NEXT: movzbl %cl, %ecx +; 
AVX-32-NEXT: shll $31, %ecx +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v2f64_to_v2i64: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomisd %xmm1, %xmm0 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltsd %xmm1, %xmm0, %xmm2 +; AVX-64-NEXT: vandnpd %xmm1, %xmm2, %xmm2 +; AVX-64-NEXT: vsubsd %xmm2, %xmm0, %xmm2 +; AVX-64-NEXT: vcvttsd2si %xmm2, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm2 +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomisd %xmm1, %xmm0 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltsd %xmm1, %xmm0, %xmm3 +; AVX-64-NEXT: vandnpd %xmm1, %xmm3, %xmm1 +; AVX-64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vcvttsd2si %xmm0, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm0 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-64-NEXT: retq +; +; AVX512VL-32-LABEL: strict_vector_fptoui_v2f64_to_v2i64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $16, %esp +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: vucomisd %xmm2, %xmm1 +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: kmovw %ecx, 
%k1 +; AVX512VL-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovapd %xmm2, %xmm4 +; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vmovsd %xmm1, (%esp) +; AVX512VL-32-NEXT: fldl (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: xorl %ecx, %ecx +; AVX512VL-32-NEXT: vucomisd %xmm2, %xmm0 +; AVX512VL-32-NEXT: setb %dl +; AVX512VL-32-NEXT: kmovw %edx, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm2, %xmm2 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: shll $31, %ecx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; +; AVX512VL-64-LABEL: strict_vector_fptoui_v2f64_to_v2i64: +; AVX512VL-64: # %bb.0: +; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm1 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm0 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-64-NEXT: retq +; +; FIXME: This is an unsafe behavior for strict FP +; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; 
AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f64(<2 x double> %a, + metadata !"fpexcept.strict") + ret <2 x i64> %ret +} + +define <2 x i64> @strict_vector_fptosi_v2f32_to_v2i64(<2 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64: +; SSE-32: # %bb.0: +; SSE-32-NEXT: pushl %ebp +; SSE-32-NEXT: .cfi_def_cfa_offset 8 +; SSE-32-NEXT: .cfi_offset %ebp, -8 +; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: .cfi_def_cfa_register %ebp +; SSE-32-NEXT: andl $-8, %esp +; SSE-32-NEXT: subl $24, %esp +; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-32-NEXT: flds {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: flds {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw (%esp) +; SSE-32-NEXT: movzwl (%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw (%esp) +; SSE-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: popl %ebp +; SSE-32-NEXT: .cfi_def_cfa %esp, 4 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttss2si %xmm0, %rax +; SSE-64-NEXT: movq %rax, %xmm1 +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-64-NEXT: cvttss2si %xmm0, %rax +; SSE-64-NEXT: movq %rax, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-64-NEXT: movdqa 
%xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $16, %esp +; AVX-32-NEXT: vmovss %xmm0, (%esp) +; AVX-32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vcvttss2si %xmm0, %rax +; AVX-64-NEXT: vmovq %rax, %xmm1 +; AVX-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-64-NEXT: vcvttss2si %xmm0, %rax +; AVX-64-NEXT: vmovq %rax, %xmm0 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-64-NEXT: retq +; +; AVX512VL-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $16, %esp +; AVX512VL-32-NEXT: vmovd %xmm0, (%esp) +; AVX512VL-32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; 
AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; +; AVX512VL-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64: +; AVX512VL-64: # %bb.0: +; AVX512VL-64-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm1 +; AVX512VL-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-64-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm0 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-64-NEXT: retq +; +; AVX512DQ-32-LABEL: strict_vector_fptosi_v2f32_to_v2i64: +; AVX512DQ-32: # %bb.0: +; AVX512DQ-32-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQ-32-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; AVX512DQ-32-NEXT: vcvttps2qq %ymm1, %zmm1 +; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[1],zero,zero,zero +; AVX512DQ-32-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-32-NEXT: vmovd %xmm0, %eax +; AVX512DQ-32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX512DQ-32-NEXT: vpextrd $1, %xmm0, %eax +; AVX512DQ-32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX512DQ-32-NEXT: vzeroupper +; AVX512DQ-32-NEXT: retl +; +; AVX512DQ-64-LABEL: strict_vector_fptosi_v2f32_to_v2i64: +; AVX512DQ-64: # %bb.0: +; AVX512DQ-64-NEXT: vcvttss2si %xmm0, %rax +; AVX512DQ-64-NEXT: vmovq %rax, %xmm1 +; AVX512DQ-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512DQ-64-NEXT: vcvttss2si %xmm0, %rax +; AVX512DQ-64-NEXT: vmovq %rax, %xmm0 +; AVX512DQ-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512DQ-64-NEXT: retq + %ret = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f32(<2 x float> %a, + metadata !"fpexcept.strict") + ret <2 x i64> %ret +} + +define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 { +; SSE-32-LABEL: 
strict_vector_fptoui_v2f32_to_v2i64: +; SSE-32: # %bb.0: +; SSE-32-NEXT: pushl %ebp +; SSE-32-NEXT: .cfi_def_cfa_offset 8 +; SSE-32-NEXT: .cfi_offset %ebp, -8 +; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: .cfi_def_cfa_register %ebp +; SSE-32-NEXT: andl $-8, %esp +; SSE-32-NEXT: subl $24, %esp +; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-32-NEXT: movaps %xmm0, %xmm2 +; SSE-32-NEXT: cmpltss %xmm1, %xmm2 +; SSE-32-NEXT: andnps %xmm1, %xmm2 +; SSE-32-NEXT: movaps %xmm0, %xmm3 +; SSE-32-NEXT: subss %xmm2, %xmm3 +; SSE-32-NEXT: movss %xmm3, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movaps %xmm0, %xmm2 +; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3] +; SSE-32-NEXT: movaps %xmm2, %xmm3 +; SSE-32-NEXT: cmpltss %xmm1, %xmm3 +; SSE-32-NEXT: andnps %xmm1, %xmm3 +; SSE-32-NEXT: movaps %xmm2, %xmm4 +; SSE-32-NEXT: subss %xmm3, %xmm4 +; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) +; SSE-32-NEXT: flds {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: flds {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw (%esp) +; SSE-32-NEXT: movzwl (%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw (%esp) +; SSE-32-NEXT: xorl %eax, %eax +; SSE-32-NEXT: ucomiss %xmm1, %xmm0 +; SSE-32-NEXT: setae %al +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-32-NEXT: xorl %eax, %eax +; SSE-32-NEXT: ucomiss %xmm1, %xmm2 +; SSE-32-NEXT: setae %al +; SSE-32-NEXT: shll $31, %eax 
+; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: movd %eax, %xmm1 +; SSE-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: popl %ebp +; SSE-32-NEXT: .cfi_def_cfa %esp, 4 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64: +; SSE-64: # %bb.0: +; SSE-64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-64-NEXT: xorl %eax, %eax +; SSE-64-NEXT: ucomiss %xmm2, %xmm0 +; SSE-64-NEXT: setae %al +; SSE-64-NEXT: shlq $63, %rax +; SSE-64-NEXT: movaps %xmm0, %xmm1 +; SSE-64-NEXT: cmpltss %xmm2, %xmm1 +; SSE-64-NEXT: andnps %xmm2, %xmm1 +; SSE-64-NEXT: movaps %xmm0, %xmm3 +; SSE-64-NEXT: subss %xmm1, %xmm3 +; SSE-64-NEXT: cvttss2si %xmm3, %rcx +; SSE-64-NEXT: xorq %rax, %rcx +; SSE-64-NEXT: movq %rcx, %xmm1 +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-64-NEXT: xorl %eax, %eax +; SSE-64-NEXT: ucomiss %xmm2, %xmm0 +; SSE-64-NEXT: setae %al +; SSE-64-NEXT: shlq $63, %rax +; SSE-64-NEXT: movaps %xmm0, %xmm3 +; SSE-64-NEXT: cmpltss %xmm2, %xmm3 +; SSE-64-NEXT: andnps %xmm2, %xmm3 +; SSE-64-NEXT: subss %xmm3, %xmm0 +; SSE-64-NEXT: cvttss2si %xmm0, %rcx +; SSE-64-NEXT: xorq %rax, %rcx +; SSE-64-NEXT: movq %rcx, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $16, %esp +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-32-NEXT: vcmpltss %xmm2, %xmm1, %xmm3 +; AVX-32-NEXT: vandnps %xmm2, %xmm3, %xmm3 +; 
AVX-32-NEXT: vsubss %xmm3, %xmm1, %xmm3 +; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vcmpltss %xmm2, %xmm0, %xmm3 +; AVX-32-NEXT: vandnps %xmm2, %xmm3, %xmm3 +; AVX-32-NEXT: vsubss %xmm3, %xmm0, %xmm3 +; AVX-32-NEXT: vmovss %xmm3, (%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: xorl %eax, %eax +; AVX-32-NEXT: vucomiss %xmm2, %xmm1 +; AVX-32-NEXT: setae %al +; AVX-32-NEXT: shll $31, %eax +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: xorl %ecx, %ecx +; AVX-32-NEXT: vucomiss %xmm2, %xmm0 +; AVX-32-NEXT: setae %cl +; AVX-32-NEXT: shll $31, %ecx +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomiss %xmm1, %xmm0 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltss %xmm1, %xmm0, %xmm2 +; AVX-64-NEXT: vandnps %xmm1, %xmm2, %xmm2 +; AVX-64-NEXT: vsubss %xmm2, %xmm0, %xmm2 +; AVX-64-NEXT: vcvttss2si %xmm2, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm2 +; AVX-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomiss %xmm1, %xmm0 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltss %xmm1, %xmm0, %xmm3 +; AVX-64-NEXT: vandnps %xmm1, %xmm3, %xmm1 +; AVX-64-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vcvttss2si %xmm0, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm0 +; 
AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-64-NEXT: retq +; +; AVX512VL-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $16, %esp +; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vcmpltss %xmm2, %xmm1, %k1 +; AVX512VL-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovaps %xmm2, %xmm4 +; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm4, %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovss %xmm4, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vcmpltss %xmm2, %xmm0, %k1 +; AVX512VL-32-NEXT: vmovaps %xmm2, %xmm4 +; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm4, %xmm0, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm3, (%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: vucomiss %xmm2, %xmm1 +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: xorl %ecx, %ecx +; AVX512VL-32-NEXT: vucomiss %xmm2, %xmm0 +; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: shll $31, %ecx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; 
AVX512VL-32-NEXT: retl +; +; AVX512VL-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64: +; AVX512VL-64: # %bb.0: +; AVX512VL-64-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm1 +; AVX512VL-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-64-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm0 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512VL-64-NEXT: retq +; +; AVX512DQ-32-LABEL: strict_vector_fptoui_v2f32_to_v2i64: +; AVX512DQ-32: # %bb.0: +; AVX512DQ-32-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX512DQ-32-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; AVX512DQ-32-NEXT: vcvttps2uqq %ymm1, %zmm1 +; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[1],zero,zero,zero +; AVX512DQ-32-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-32-NEXT: vmovd %xmm0, %eax +; AVX512DQ-32-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX512DQ-32-NEXT: vpextrd $1, %xmm0, %eax +; AVX512DQ-32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX512DQ-32-NEXT: vzeroupper +; AVX512DQ-32-NEXT: retl +; +; AVX512DQ-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64: +; AVX512DQ-64: # %bb.0: +; AVX512DQ-64-NEXT: vcvttss2usi %xmm0, %rax +; AVX512DQ-64-NEXT: vmovq %rax, %xmm1 +; AVX512DQ-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512DQ-64-NEXT: vcvttss2usi %xmm0, %rax +; AVX512DQ-64-NEXT: vmovq %rax, %xmm0 +; AVX512DQ-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512DQ-64-NEXT: retq + %ret = call <2 x i64> @llvm.experimental.constrained.fptoui.v2i64.v2f32(<2 x float> %a, + metadata !"fpexcept.strict") + ret <2 x i64> %ret +} + +define <2 x i32> @strict_vector_fptosi_v2f64_to_v2i32(<2 x double> %a) #0 { +; SSE-32-LABEL: strict_vector_fptosi_v2f64_to_v2i32: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptosi_v2f64_to_v2i32: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-LABEL: strict_vector_fptosi_v2f64_to_v2i32: +; 
AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptosi_v2f64_to_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v2f64_to_v2i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f64(<2 x double> %a, + metadata !"fpexcept.strict") + ret <2 x i32> %ret +} + +define <2 x i32> @strict_vector_fptoui_v2f64_to_v2i32(<2 x double> %a) #0 { +; SSE-32-LABEL: strict_vector_fptoui_v2f64_to_v2i32: +; SSE-32: # %bb.0: +; SSE-32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-32-NEXT: xorl %eax, %eax +; SSE-32-NEXT: ucomisd %xmm2, %xmm0 +; SSE-32-NEXT: setae %al +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: movapd %xmm0, %xmm1 +; SSE-32-NEXT: cmpltsd %xmm2, %xmm1 +; SSE-32-NEXT: andnpd %xmm2, %xmm1 +; SSE-32-NEXT: movapd %xmm0, %xmm3 +; SSE-32-NEXT: subsd %xmm1, %xmm3 +; SSE-32-NEXT: cvttsd2si %xmm3, %ecx +; SSE-32-NEXT: xorl %eax, %ecx +; SSE-32-NEXT: movd %ecx, %xmm1 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-32-NEXT: xorl %eax, %eax +; SSE-32-NEXT: ucomisd %xmm2, %xmm0 +; SSE-32-NEXT: setae %al +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: movapd %xmm0, %xmm3 +; SSE-32-NEXT: cmpltsd %xmm2, %xmm3 +; SSE-32-NEXT: andnpd %xmm2, %xmm3 +; SSE-32-NEXT: subsd %xmm3, %xmm0 +; SSE-32-NEXT: cvttsd2si %xmm0, %ecx +; SSE-32-NEXT: xorl %eax, %ecx +; SSE-32-NEXT: movd %ecx, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-32-NEXT: movdqa %xmm1, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i32: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: movd %eax, %xmm1 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: movd %eax, %xmm0 +; SSE-64-NEXT: punpckldq 
{{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: strict_vector_fptoui_v2f64_to_v2i32: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $16, %esp +; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovhps %xmm0, (%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, (%esp), %xmm0, %xmm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v2f64_to_v2i32: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-64-NEXT: vcvttsd2si %xmm1, %rax +; AVX-64-NEXT: vcvttsd2si %xmm0, %rcx +; AVX-64-NEXT: vmovd %ecx, %xmm0 +; AVX-64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptoui_v2f64_to_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-NEXT: vcvttsd2usi %xmm1, %eax +; AVX512VL-NEXT: vcvttsd2usi %xmm0, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm0 +; AVX512VL-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512DQ-NEXT: vcvttsd2usi %xmm1, %eax +; AVX512DQ-NEXT: vcvttsd2usi %xmm0, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm0 +; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f64(<2 x double> %a, + metadata !"fpexcept.strict") + ret <2 x i32> %ret +} + 
+define <2 x i32> @strict_vector_fptosi_v2f32_to_v2i32(<2 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptosi_v2f32_to_v2i32: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttss2si %xmm0, %eax +; SSE-32-NEXT: movd %eax, %xmm1 +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-32-NEXT: cvttss2si %xmm0, %eax +; SSE-32-NEXT: movd %eax, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-32-NEXT: movdqa %xmm1, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptosi_v2f32_to_v2i32: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttss2si %xmm0, %eax +; SSE-64-NEXT: movd %eax, %xmm1 +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-64-NEXT: cvttss2si %xmm0, %eax +; SSE-64-NEXT: movd %eax, %xmm0 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-LABEL: strict_vector_fptosi_v2f32_to_v2i32: +; AVX: # %bb.0: +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vcvttss2si %xmm1, %eax +; AVX-NEXT: vcvttss2si %xmm0, %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptosi_v2f32_to_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm1, %eax +; AVX512VL-NEXT: vcvttss2si %xmm0, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm0 +; AVX512VL-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v2f32_to_v2i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512DQ-NEXT: vcvttss2si %xmm1, %eax +; AVX512DQ-NEXT: vcvttss2si %xmm0, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm0 +; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f32(<2 x float> %a, + metadata !"fpexcept.strict") + ret <2 x i32> %ret +} + 
+define <2 x i32> @strict_vector_fptoui_v2f32_to_v2i32(<2 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptoui_v2f32_to_v2i32: +; SSE-32: # %bb.0: +; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-32-NEXT: xorl %eax, %eax +; SSE-32-NEXT: ucomiss %xmm2, %xmm0 +; SSE-32-NEXT: setae %al +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: movaps %xmm0, %xmm1 +; SSE-32-NEXT: cmpltss %xmm2, %xmm1 +; SSE-32-NEXT: andnps %xmm2, %xmm1 +; SSE-32-NEXT: movaps %xmm0, %xmm3 +; SSE-32-NEXT: subss %xmm1, %xmm3 +; SSE-32-NEXT: cvttss2si %xmm3, %ecx +; SSE-32-NEXT: xorl %eax, %ecx +; SSE-32-NEXT: movd %ecx, %xmm1 +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-32-NEXT: xorl %eax, %eax +; SSE-32-NEXT: ucomiss %xmm2, %xmm0 +; SSE-32-NEXT: setae %al +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: movaps %xmm0, %xmm3 +; SSE-32-NEXT: cmpltss %xmm2, %xmm3 +; SSE-32-NEXT: andnps %xmm2, %xmm3 +; SSE-32-NEXT: subss %xmm3, %xmm0 +; SSE-32-NEXT: cvttss2si %xmm0, %ecx +; SSE-32-NEXT: xorl %eax, %ecx +; SSE-32-NEXT: movd %ecx, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-32-NEXT: movdqa %xmm1, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i32: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttss2si %xmm0, %rax +; SSE-64-NEXT: movd %eax, %xmm1 +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-64-NEXT: cvttss2si %xmm0, %rax +; SSE-64-NEXT: movd %eax, %xmm0 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: strict_vector_fptoui_v2f32_to_v2i32: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $16, %esp +; AVX-32-NEXT: vmovss %xmm0, (%esp) +; AVX-32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; 
AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v2f32_to_v2i32: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-64-NEXT: vcvttss2si %xmm1, %rax +; AVX-64-NEXT: vcvttss2si %xmm0, %rcx +; AVX-64-NEXT: vmovd %ecx, %xmm0 +; AVX-64-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptoui_v2f32_to_v2i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2usi %xmm1, %eax +; AVX512VL-NEXT: vcvttss2usi %xmm0, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm0 +; AVX512VL-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v2f32_to_v2i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512DQ-NEXT: vcvttss2usi %xmm1, %eax +; AVX512DQ-NEXT: vcvttss2usi %xmm0, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm0 +; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i32> @llvm.experimental.constrained.fptoui.v2i32.v2f32(<2 x float> %a, + metadata !"fpexcept.strict") + ret <2 x i32> %ret +} + +define <2 x i16> @strict_vector_fptosi_v2f64_to_v2i16(<2 x double> %a) #0 { +; SSE-32-LABEL: strict_vector_fptosi_v2f64_to_v2i16: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptosi_v2f64_to_v2i16: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-64-NEXT: retq +; +; AVX-LABEL: 
strict_vector_fptosi_v2f64_to_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptosi_v2f64_to_v2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v2f64_to_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f64(<2 x double> %a, + metadata !"fpexcept.strict") + ret <2 x i16> %ret +} + +define <2 x i16> @strict_vector_fptoui_v2f64_to_v2i16(<2 x double> %a) #0 { +; SSE-32-LABEL: strict_vector_fptoui_v2f64_to_v2i16: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i16: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE-64-NEXT: retq +; +; AVX-LABEL: strict_vector_fptoui_v2f64_to_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptoui_v2f64_to_v2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f64(<2 x double> %a, + metadata !"fpexcept.strict") + ret <2 x 
i16> %ret +} + +define <2 x i16> @strict_vector_fptosi_v2f32_to_v2i16(<2 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptosi_v2f32_to_v2i16: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttss2si %xmm0, %eax +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-32-NEXT: cvttss2si %xmm0, %ecx +; SSE-32-NEXT: movd %eax, %xmm0 +; SSE-32-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptosi_v2f32_to_v2i16: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttss2si %xmm0, %eax +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-64-NEXT: cvttss2si %xmm0, %ecx +; SSE-64-NEXT: movd %eax, %xmm0 +; SSE-64-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-LABEL: strict_vector_fptosi_v2f32_to_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vcvttss2si %xmm1, %eax +; AVX-NEXT: vcvttss2si %xmm0, %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptosi_v2f32_to_v2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm1, %eax +; AVX512VL-NEXT: vcvttss2si %xmm0, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm0 +; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v2f32_to_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512DQ-NEXT: vcvttss2si %xmm1, %eax +; AVX512DQ-NEXT: vcvttss2si %xmm0, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm0 +; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i16> @llvm.experimental.constrained.fptosi.v2i16.v2f32(<2 x float> %a, + metadata !"fpexcept.strict") + ret <2 x i16> %ret +} + +define <2 x i16> @strict_vector_fptoui_v2f32_to_v2i16(<2 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptoui_v2f32_to_v2i16: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttss2si %xmm0, %eax +; 
SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-32-NEXT: cvttss2si %xmm0, %ecx +; SSE-32-NEXT: movd %eax, %xmm0 +; SSE-32-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i16: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttss2si %xmm0, %eax +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-64-NEXT: cvttss2si %xmm0, %ecx +; SSE-64-NEXT: movd %eax, %xmm0 +; SSE-64-NEXT: pinsrw $1, %ecx, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-LABEL: strict_vector_fptoui_v2f32_to_v2i16: +; AVX: # %bb.0: +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vcvttss2si %xmm1, %eax +; AVX-NEXT: vcvttss2si %xmm0, %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptoui_v2f32_to_v2i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm1, %eax +; AVX512VL-NEXT: vcvttss2si %xmm0, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm0 +; AVX512VL-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v2f32_to_v2i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512DQ-NEXT: vcvttss2si %xmm1, %eax +; AVX512DQ-NEXT: vcvttss2si %xmm0, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm0 +; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i16> @llvm.experimental.constrained.fptoui.v2i16.v2f32(<2 x float> %a, + metadata !"fpexcept.strict") + ret <2 x i16> %ret +} + +define <2 x i8> @strict_vector_fptosi_v2f64_to_v2i8(<2 x double> %a) #0 { +; SSE-32-LABEL: strict_vector_fptosi_v2f64_to_v2i8: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-32-NEXT: andpd {{\.LCPI.*}}, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptosi_v2f64_to_v2i8: +; SSE-64: # %bb.0: 
+; SSE-64-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-64-NEXT: andpd {{.*}}(%rip), %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-LABEL: strict_vector_fptosi_v2f64_to_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptosi_v2f64_to_v2i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v2f64_to_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i8> @llvm.experimental.constrained.fptosi.v2i8.v2f64(<2 x double> %a, + metadata !"fpexcept.strict") + ret <2 x i8> %ret +} + +define <2 x i8> @strict_vector_fptoui_v2f64_to_v2i8(<2 x double> %a) #0 { +; SSE-32-LABEL: strict_vector_fptoui_v2f64_to_v2i8: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-32-NEXT: andpd {{\.LCPI.*}}, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i8: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttpd2dq %xmm0, %xmm0 +; SSE-64-NEXT: andpd {{.*}}(%rip), %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-LABEL: strict_vector_fptoui_v2f64_to_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptoui_v2f64_to_v2i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; 
AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i8> @llvm.experimental.constrained.fptoui.v2i8.v2f64(<2 x double> %a, + metadata !"fpexcept.strict") + ret <2 x i8> %ret +} + +define <2 x i8> @strict_vector_fptosi_v2f32_to_v2i8(<2 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptosi_v2f32_to_v2i8: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttss2si %xmm0, %eax +; SSE-32-NEXT: movzbl %al, %eax +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-32-NEXT: cvttss2si %xmm0, %ecx +; SSE-32-NEXT: shll $8, %ecx +; SSE-32-NEXT: orl %eax, %ecx +; SSE-32-NEXT: movd %ecx, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptosi_v2f32_to_v2i8: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttss2si %xmm0, %eax +; SSE-64-NEXT: movzbl %al, %eax +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-64-NEXT: cvttss2si %xmm0, %ecx +; SSE-64-NEXT: shll $8, %ecx +; SSE-64-NEXT: orl %eax, %ecx +; SSE-64-NEXT: movd %ecx, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-LABEL: strict_vector_fptosi_v2f32_to_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vcvttss2si %xmm1, %eax +; AVX-NEXT: vcvttss2si %xmm0, %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptosi_v2f32_to_v2i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm1, %eax +; AVX512VL-NEXT: vcvttss2si %xmm0, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm0 +; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v2f32_to_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512DQ-NEXT: vcvttss2si %xmm1, %eax +; 
AVX512DQ-NEXT: vcvttss2si %xmm0, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm0 +; AVX512DQ-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i8> @llvm.experimental.constrained.fptosi.v2i8.v2f32(<2 x float> %a, + metadata !"fpexcept.strict") + ret <2 x i8> %ret +} + +define <2 x i8> @strict_vector_fptoui_v2f32_to_v2i8(<2 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptoui_v2f32_to_v2i8: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttss2si %xmm0, %eax +; SSE-32-NEXT: movzbl %al, %eax +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-32-NEXT: cvttss2si %xmm0, %ecx +; SSE-32-NEXT: shll $8, %ecx +; SSE-32-NEXT: orl %eax, %ecx +; SSE-32-NEXT: movd %ecx, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i8: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttss2si %xmm0, %eax +; SSE-64-NEXT: movzbl %al, %eax +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-64-NEXT: cvttss2si %xmm0, %ecx +; SSE-64-NEXT: shll $8, %ecx +; SSE-64-NEXT: orl %eax, %ecx +; SSE-64-NEXT: movd %ecx, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-LABEL: strict_vector_fptoui_v2f32_to_v2i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-NEXT: vcvttss2si %xmm1, %eax +; AVX-NEXT: vcvttss2si %xmm0, %ecx +; AVX-NEXT: vmovd %ecx, %xmm0 +; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptoui_v2f32_to_v2i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm1, %eax +; AVX512VL-NEXT: vcvttss2si %xmm0, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm0 +; AVX512VL-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0 +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v2f32_to_v2i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512DQ-NEXT: vcvttss2si %xmm1, %eax +; AVX512DQ-NEXT: vcvttss2si %xmm0, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm0 +; AVX512DQ-NEXT: vpinsrb $1, %eax, 
%xmm0, %xmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i8> @llvm.experimental.constrained.fptoui.v2i8.v2f32(<2 x float> %a, + metadata !"fpexcept.strict") + ret <2 x i8> %ret +} + +define <2 x i1> @strict_vector_fptosi_v2f64_to_v2i1(<2 x double> %a) #0 { +; SSE-32-LABEL: strict_vector_fptosi_v2f64_to_v2i1: +; SSE-32: # %bb.0: +; SSE-32-NEXT: pushl %ebp +; SSE-32-NEXT: .cfi_def_cfa_offset 8 +; SSE-32-NEXT: .cfi_offset %ebp, -8 +; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: .cfi_def_cfa_register %ebp +; SSE-32-NEXT: andl $-8, %esp +; SSE-32-NEXT: subl $24, %esp +; SSE-32-NEXT: movhps %xmm0, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movlps %xmm0, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldl {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldl {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw (%esp) +; SSE-32-NEXT: movzwl (%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw (%esp) +; SSE-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: popl %ebp +; SSE-32-NEXT: .cfi_def_cfa %esp, 4 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptosi_v2f64_to_v2i1: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: movq %rax, %xmm1 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-64-NEXT: cvttsd2si %xmm0, %rax +; SSE-64-NEXT: movq %rax, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: 
strict_vector_fptosi_v2f64_to_v2i1: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $16, %esp +; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovhps %xmm0, (%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptosi_v2f64_to_v2i1: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX-64-NEXT: vmovq %rax, %xmm1 +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX-64-NEXT: vmovq %rax, %xmm0 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptosi_v2f64_to_v2i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v2f64_to_v2i1: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i1> @llvm.experimental.constrained.fptosi.v2i1.v2f64(<2 x 
double> %a, + metadata !"fpexcept.strict") + ret <2 x i1> %ret +} + +define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 { +; SSE-32-LABEL: strict_vector_fptoui_v2f64_to_v2i1: +; SSE-32: # %bb.0: +; SSE-32-NEXT: pushl %ebp +; SSE-32-NEXT: .cfi_def_cfa_offset 8 +; SSE-32-NEXT: .cfi_offset %ebp, -8 +; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: .cfi_def_cfa_register %ebp +; SSE-32-NEXT: andl $-8, %esp +; SSE-32-NEXT: subl $24, %esp +; SSE-32-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-32-NEXT: ucomisd %xmm2, %xmm0 +; SSE-32-NEXT: xorpd %xmm1, %xmm1 +; SSE-32-NEXT: xorpd %xmm3, %xmm3 +; SSE-32-NEXT: jb .LBB17_2 +; SSE-32-NEXT: # %bb.1: +; SSE-32-NEXT: movapd %xmm2, %xmm3 +; SSE-32-NEXT: .LBB17_2: +; SSE-32-NEXT: movapd %xmm0, %xmm4 +; SSE-32-NEXT: subsd %xmm3, %xmm4 +; SSE-32-NEXT: movsd %xmm4, {{[0-9]+}}(%esp) +; SSE-32-NEXT: setae %al +; SSE-32-NEXT: fldl {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; SSE-32-NEXT: orl $3072, %ecx # imm = 0xC00 +; SSE-32-NEXT: movw %cx, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-32-NEXT: ucomisd %xmm2, %xmm0 +; SSE-32-NEXT: jb .LBB17_4 +; SSE-32-NEXT: # %bb.3: +; SSE-32-NEXT: movapd %xmm2, %xmm1 +; SSE-32-NEXT: .LBB17_4: +; SSE-32-NEXT: subsd %xmm1, %xmm0 +; SSE-32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; SSE-32-NEXT: setae %cl +; SSE-32-NEXT: fldl {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw (%esp) +; SSE-32-NEXT: movzwl (%esp), %edx +; SSE-32-NEXT: orl $3072, %edx # imm = 0xC00 +; SSE-32-NEXT: movw %dx, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw (%esp) +; SSE-32-NEXT: movzbl %al, %eax +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: movd %eax, %xmm1 +; SSE-32-NEXT: movd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero +; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-32-NEXT: movzbl %cl, %eax +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: movd %eax, %xmm1 +; SSE-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: popl %ebp +; SSE-32-NEXT: .cfi_def_cfa %esp, 4 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i1: +; SSE-64: # %bb.0: +; SSE-64-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-64-NEXT: xorl %eax, %eax +; SSE-64-NEXT: ucomisd %xmm2, %xmm0 +; SSE-64-NEXT: setae %al +; SSE-64-NEXT: shlq $63, %rax +; SSE-64-NEXT: movapd %xmm0, %xmm1 +; SSE-64-NEXT: cmpltsd %xmm2, %xmm1 +; SSE-64-NEXT: andnpd %xmm2, %xmm1 +; SSE-64-NEXT: movapd %xmm0, %xmm3 +; SSE-64-NEXT: subsd %xmm1, %xmm3 +; SSE-64-NEXT: cvttsd2si %xmm3, %rcx +; SSE-64-NEXT: xorq %rax, %rcx +; SSE-64-NEXT: movq %rcx, %xmm1 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] +; SSE-64-NEXT: xorl %eax, %eax +; SSE-64-NEXT: ucomisd %xmm2, %xmm0 +; SSE-64-NEXT: setae %al +; SSE-64-NEXT: shlq $63, %rax +; SSE-64-NEXT: movapd %xmm0, %xmm3 +; SSE-64-NEXT: cmpltsd %xmm2, %xmm3 +; SSE-64-NEXT: andnpd %xmm2, %xmm3 +; SSE-64-NEXT: subsd %xmm3, %xmm0 +; SSE-64-NEXT: cvttsd2si %xmm0, %rcx +; SSE-64-NEXT: xorq %rax, %rcx +; SSE-64-NEXT: movq %rcx, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: strict_vector_fptoui_v2f64_to_v2i1: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $16, %esp +; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; 
AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: jb .LBB17_2 +; AVX-32-NEXT: # %bb.1: +; AVX-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX-32-NEXT: .LBB17_2: +; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX-32-NEXT: vmovsd %xmm3, (%esp) +; AVX-32-NEXT: fldl (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: setae %al +; AVX-32-NEXT: movzbl %al, %eax +; AVX-32-NEXT: shll $31, %eax +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vucomisd %xmm1, %xmm0 +; AVX-32-NEXT: jb .LBB17_4 +; AVX-32-NEXT: # %bb.3: +; AVX-32-NEXT: vmovapd %xmm1, %xmm2 +; AVX-32-NEXT: .LBB17_4: +; AVX-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: setae %cl +; AVX-32-NEXT: movzbl %cl, %ecx +; AVX-32-NEXT: shll $31, %ecx +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v2f64_to_v2i1: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomisd %xmm1, %xmm0 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltsd %xmm1, %xmm0, %xmm2 +; AVX-64-NEXT: vandnpd %xmm1, %xmm2, %xmm2 +; AVX-64-NEXT: vsubsd %xmm2, %xmm0, %xmm2 +; AVX-64-NEXT: vcvttsd2si %xmm2, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm2 +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomisd %xmm1, %xmm0 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq 
$63, %rax +; AVX-64-NEXT: vcmpltsd %xmm1, %xmm0, %xmm3 +; AVX-64-NEXT: vandnpd %xmm1, %xmm3, %xmm1 +; AVX-64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vcvttsd2si %xmm0, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm0 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptoui_v2f64_to_v2i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2udq %xmm0, %xmm0 +; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v2f64_to_v2i1: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttsd2usi %xmm0, %eax +; AVX512DQ-NEXT: vmovd %eax, %xmm1 +; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512DQ-NEXT: vcvttsd2usi %xmm2, %ecx +; AVX512DQ-NEXT: vcvttsd2usi %xmm0, %edx +; AVX512DQ-NEXT: vmovd %edx, %xmm0 +; AVX512DQ-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i1> @llvm.experimental.constrained.fptoui.v2i1.v2f64(<2 x double> %a, + metadata !"fpexcept.strict") + ret <2 x i1> %ret +} + +define <2 x i1> @strict_vector_fptosi_v2f32_to_v2i1(<2 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptosi_v2f32_to_v2i1: +; SSE-32: # %bb.0: +; SSE-32-NEXT: pushl %ebp +; SSE-32-NEXT: .cfi_def_cfa_offset 8 +; SSE-32-NEXT: .cfi_offset %ebp, -8 +; SSE-32-NEXT: 
movl %esp, %ebp +; SSE-32-NEXT: .cfi_def_cfa_register %ebp +; SSE-32-NEXT: andl $-8, %esp +; SSE-32-NEXT: subl $24, %esp +; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-32-NEXT: movss %xmm0, {{[0-9]+}}(%esp) +; SSE-32-NEXT: flds {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: flds {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw (%esp) +; SSE-32-NEXT: movzwl (%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw (%esp) +; SSE-32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: popl %ebp +; SSE-32-NEXT: .cfi_def_cfa %esp, 4 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptosi_v2f32_to_v2i1: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttss2si %xmm0, %rax +; SSE-64-NEXT: movq %rax, %xmm1 +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-64-NEXT: cvttss2si %xmm0, %rax +; SSE-64-NEXT: movq %rax, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: strict_vector_fptosi_v2f32_to_v2i1: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $16, %esp +; AVX-32-NEXT: vmovss %xmm0, (%esp) +; AVX-32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds (%esp) +; 
AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptosi_v2f32_to_v2i1: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vcvttss2si %xmm0, %rax +; AVX-64-NEXT: vmovq %rax, %xmm1 +; AVX-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-64-NEXT: vcvttss2si %xmm0, %rax +; AVX-64-NEXT: vmovq %rax, %xmm0 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptosi_v2f32_to_v2i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttss2si %xmm0, %eax +; AVX512VL-NEXT: andl $1, %eax +; AVX512VL-NEXT: kmovw %eax, %k0 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm0, %eax +; AVX512VL-NEXT: kmovw %eax, %k1 +; AVX512VL-NEXT: kshiftlw $1, %k1, %k1 +; AVX512VL-NEXT: korw %k1, %k0, %k1 +; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v2f32_to_v2i1: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512DQ-NEXT: vcvttss2si %xmm1, %eax +; AVX512DQ-NEXT: kmovw %eax, %k0 +; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0 +; AVX512DQ-NEXT: vcvttss2si %xmm0, %eax +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlb $7, %k1, %k1 +; AVX512DQ-NEXT: kshiftrb $7, %k1, %k1 +; AVX512DQ-NEXT: korw %k0, %k1, %k0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i1> 
@llvm.experimental.constrained.fptosi.v2i1.v2f32(<2 x float> %a, + metadata !"fpexcept.strict") + ret <2 x i1> %ret +} + +define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptoui_v2f32_to_v2i1: +; SSE-32: # %bb.0: +; SSE-32-NEXT: pushl %ebp +; SSE-32-NEXT: .cfi_def_cfa_offset 8 +; SSE-32-NEXT: .cfi_offset %ebp, -8 +; SSE-32-NEXT: movl %esp, %ebp +; SSE-32-NEXT: .cfi_def_cfa_register %ebp +; SSE-32-NEXT: andl $-8, %esp +; SSE-32-NEXT: subl $24, %esp +; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-32-NEXT: movaps %xmm0, %xmm2 +; SSE-32-NEXT: cmpltss %xmm1, %xmm2 +; SSE-32-NEXT: andnps %xmm1, %xmm2 +; SSE-32-NEXT: movaps %xmm0, %xmm3 +; SSE-32-NEXT: subss %xmm2, %xmm3 +; SSE-32-NEXT: movss %xmm3, {{[0-9]+}}(%esp) +; SSE-32-NEXT: movaps %xmm0, %xmm2 +; SSE-32-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3] +; SSE-32-NEXT: movaps %xmm2, %xmm3 +; SSE-32-NEXT: cmpltss %xmm1, %xmm3 +; SSE-32-NEXT: andnps %xmm1, %xmm3 +; SSE-32-NEXT: movaps %xmm2, %xmm4 +; SSE-32-NEXT: subss %xmm3, %xmm4 +; SSE-32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) +; SSE-32-NEXT: flds {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: flds {{[0-9]+}}(%esp) +; SSE-32-NEXT: fnstcw (%esp) +; SSE-32-NEXT: movzwl (%esp), %eax +; SSE-32-NEXT: orl $3072, %eax # imm = 0xC00 +; SSE-32-NEXT: movw %ax, {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw {{[0-9]+}}(%esp) +; SSE-32-NEXT: fistpll {{[0-9]+}}(%esp) +; SSE-32-NEXT: fldcw (%esp) +; SSE-32-NEXT: xorl %eax, %eax +; SSE-32-NEXT: ucomiss %xmm1, %xmm0 +; SSE-32-NEXT: setae %al +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: movd %eax, %xmm3 +; SSE-32-NEXT: movd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero +; SSE-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-32-NEXT: xorl %eax, %eax +; SSE-32-NEXT: ucomiss %xmm1, %xmm2 +; SSE-32-NEXT: setae %al +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; SSE-32-NEXT: movd %eax, %xmm1 +; SSE-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE-32-NEXT: movl %ebp, %esp +; SSE-32-NEXT: popl %ebp +; SSE-32-NEXT: .cfi_def_cfa %esp, 4 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i1: +; SSE-64: # %bb.0: +; SSE-64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-64-NEXT: xorl %eax, %eax +; SSE-64-NEXT: ucomiss %xmm2, %xmm0 +; SSE-64-NEXT: setae %al +; SSE-64-NEXT: shlq $63, %rax +; SSE-64-NEXT: movaps %xmm0, %xmm1 +; SSE-64-NEXT: cmpltss %xmm2, %xmm1 +; SSE-64-NEXT: andnps %xmm2, %xmm1 +; SSE-64-NEXT: movaps %xmm0, %xmm3 +; SSE-64-NEXT: subss %xmm1, %xmm3 +; SSE-64-NEXT: cvttss2si %xmm3, %rcx +; SSE-64-NEXT: xorq %rax, %rcx +; SSE-64-NEXT: movq %rcx, %xmm1 +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-64-NEXT: xorl %eax, %eax +; SSE-64-NEXT: ucomiss %xmm2, %xmm0 +; SSE-64-NEXT: setae %al +; SSE-64-NEXT: shlq $63, %rax +; SSE-64-NEXT: movaps %xmm0, %xmm3 +; SSE-64-NEXT: cmpltss %xmm2, %xmm3 +; SSE-64-NEXT: andnps %xmm2, %xmm3 +; SSE-64-NEXT: subss %xmm3, %xmm0 +; SSE-64-NEXT: cvttss2si %xmm0, %rcx +; SSE-64-NEXT: xorq %rax, %rcx +; SSE-64-NEXT: movq %rcx, %xmm0 +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: strict_vector_fptoui_v2f32_to_v2i1: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl 
$16, %esp +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-32-NEXT: vcmpltss %xmm2, %xmm1, %xmm3 +; AVX-32-NEXT: vandnps %xmm2, %xmm3, %xmm3 +; AVX-32-NEXT: vsubss %xmm3, %xmm1, %xmm3 +; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vcmpltss %xmm2, %xmm0, %xmm3 +; AVX-32-NEXT: vandnps %xmm2, %xmm3, %xmm3 +; AVX-32-NEXT: vsubss %xmm3, %xmm0, %xmm3 +; AVX-32-NEXT: vmovss %xmm3, (%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: xorl %eax, %eax +; AVX-32-NEXT: vucomiss %xmm2, %xmm1 +; AVX-32-NEXT: setae %al +; AVX-32-NEXT: shll $31, %eax +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: xorl %ecx, %ecx +; AVX-32-NEXT: vucomiss %xmm2, %xmm0 +; AVX-32-NEXT: setae %cl +; AVX-32-NEXT: shll $31, %ecx +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v2f32_to_v2i1: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomiss %xmm1, %xmm0 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltss %xmm1, %xmm0, %xmm2 +; AVX-64-NEXT: vandnps %xmm1, %xmm2, %xmm2 +; AVX-64-NEXT: vsubss %xmm2, %xmm0, %xmm2 +; AVX-64-NEXT: vcvttss2si %xmm2, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm2 +; AVX-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomiss %xmm1, %xmm0 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltss 
%xmm1, %xmm0, %xmm3 +; AVX-64-NEXT: vandnps %xmm1, %xmm3, %xmm1 +; AVX-64-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vcvttss2si %xmm0, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm0 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptoui_v2f32_to_v2i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttss2si %xmm0, %eax +; AVX512VL-NEXT: andl $1, %eax +; AVX512VL-NEXT: kmovw %eax, %k0 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2si %xmm0, %eax +; AVX512VL-NEXT: kmovw %eax, %k1 +; AVX512VL-NEXT: kshiftlw $1, %k1, %k1 +; AVX512VL-NEXT: korw %k1, %k0, %k1 +; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v2f32_to_v2i1: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512DQ-NEXT: vcvttss2si %xmm1, %eax +; AVX512DQ-NEXT: kmovw %eax, %k0 +; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0 +; AVX512DQ-NEXT: vcvttss2si %xmm0, %eax +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlb $7, %k1, %k1 +; AVX512DQ-NEXT: kshiftrb $7, %k1, %k1 +; AVX512DQ-NEXT: korw %k0, %k1, %k0 +; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <2 x i1> @llvm.experimental.constrained.fptoui.v2i1.v2f32(<2 x float> %a, + metadata !"fpexcept.strict") + ret <2 x i1> %ret +} + +define <4 x i32> @strict_vector_fptosi_v4f32_to_v4i32(<4 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptosi_v4f32_to_v4i32: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptosi_v4f32_to_v4i32: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-LABEL: strict_vector_fptosi_v4f32_to_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: 
vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptosi_v4f32_to_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v4f32_to_v4i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f32(<4 x float> %a, + metadata !"fpexcept.strict") + ret <4 x i32> %ret +} + +define <4 x i32> @strict_vector_fptoui_v4f32_to_v4i32(<4 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptoui_v4f32_to_v4i32: +; SSE-32: # %bb.0: +; SSE-32-NEXT: movaps %xmm0, %xmm1 +; SSE-32-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE-32-NEXT: xorl %eax, %eax +; SSE-32-NEXT: ucomiss %xmm2, %xmm1 +; SSE-32-NEXT: setae %al +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: movaps %xmm1, %xmm3 +; SSE-32-NEXT: cmpltss %xmm2, %xmm3 +; SSE-32-NEXT: andnps %xmm2, %xmm3 +; SSE-32-NEXT: subss %xmm3, %xmm1 +; SSE-32-NEXT: cvttss2si %xmm1, %ecx +; SSE-32-NEXT: xorl %eax, %ecx +; SSE-32-NEXT: movd %ecx, %xmm1 +; SSE-32-NEXT: movaps %xmm0, %xmm3 +; SSE-32-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1] +; SSE-32-NEXT: xorl %eax, %eax +; SSE-32-NEXT: ucomiss %xmm2, %xmm3 +; SSE-32-NEXT: setae %al +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: movaps %xmm3, %xmm4 +; SSE-32-NEXT: cmpltss %xmm2, %xmm4 +; SSE-32-NEXT: andnps %xmm2, %xmm4 +; SSE-32-NEXT: subss %xmm4, %xmm3 +; SSE-32-NEXT: cvttss2si %xmm3, %ecx +; SSE-32-NEXT: xorl %eax, %ecx +; SSE-32-NEXT: movd %ecx, %xmm3 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-32-NEXT: xorl %eax, %eax +; SSE-32-NEXT: ucomiss %xmm2, %xmm0 +; SSE-32-NEXT: setae %al +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: movaps %xmm0, %xmm1 +; SSE-32-NEXT: cmpltss %xmm2, %xmm1 +; SSE-32-NEXT: andnps %xmm2, %xmm1 +; SSE-32-NEXT: movaps %xmm0, %xmm4 +; 
SSE-32-NEXT: subss %xmm1, %xmm4 +; SSE-32-NEXT: cvttss2si %xmm4, %ecx +; SSE-32-NEXT: xorl %eax, %ecx +; SSE-32-NEXT: movd %ecx, %xmm1 +; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-32-NEXT: xorl %eax, %eax +; SSE-32-NEXT: ucomiss %xmm2, %xmm0 +; SSE-32-NEXT: setae %al +; SSE-32-NEXT: shll $31, %eax +; SSE-32-NEXT: movaps %xmm0, %xmm4 +; SSE-32-NEXT: cmpltss %xmm2, %xmm4 +; SSE-32-NEXT: andnps %xmm2, %xmm4 +; SSE-32-NEXT: subss %xmm4, %xmm0 +; SSE-32-NEXT: cvttss2si %xmm0, %ecx +; SSE-32-NEXT: xorl %eax, %ecx +; SSE-32-NEXT: movd %ecx, %xmm0 +; SSE-32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-32-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] +; SSE-32-NEXT: movdqa %xmm1, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptoui_v4f32_to_v4i32: +; SSE-64: # %bb.0: +; SSE-64-NEXT: movaps %xmm0, %xmm1 +; SSE-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] +; SSE-64-NEXT: cvttss2si %xmm1, %rax +; SSE-64-NEXT: movd %eax, %xmm1 +; SSE-64-NEXT: movaps %xmm0, %xmm2 +; SSE-64-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] +; SSE-64-NEXT: cvttss2si %xmm2, %rax +; SSE-64-NEXT: movd %eax, %xmm2 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-64-NEXT: cvttss2si %xmm0, %rax +; SSE-64-NEXT: movd %eax, %xmm1 +; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE-64-NEXT: cvttss2si %xmm0, %rax +; SSE-64-NEXT: movd %eax, %xmm0 +; SSE-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE-64-NEXT: movdqa %xmm1, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-32-LABEL: strict_vector_fptoui_v4f32_to_v4i32: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $32, %esp +; AVX-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: 
vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractps $3, %xmm0, (%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $3, (%esp), %xmm0, %xmm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v4f32_to_v4i32: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX-64-NEXT: vcvttss2si %xmm1, %rax +; AVX-64-NEXT: vcvttss2si %xmm0, %rcx +; AVX-64-NEXT: vmovd %ecx, %xmm1 +; AVX-64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-64-NEXT: vcvttss2si %xmm2, %rax +; AVX-64-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX-64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-64-NEXT: vcvttss2si %xmm0, %rax +; AVX-64-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptoui_v4f32_to_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2usi %xmm1, %eax +; AVX512VL-NEXT: vcvttss2usi %xmm0, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm1 +; AVX512VL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-NEXT: vcvttss2usi %xmm2, %eax +; AVX512VL-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvttss2usi %xmm0, %eax +; AVX512VL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX512VL-NEXT: 
ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v4f32_to_v4i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512DQ-NEXT: vcvttss2usi %xmm1, %eax +; AVX512DQ-NEXT: vcvttss2usi %xmm0, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm1 +; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512DQ-NEXT: vcvttss2usi %xmm2, %eax +; AVX512DQ-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512DQ-NEXT: vcvttss2usi %xmm0, %eax +; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f32(<4 x float> %a, + metadata !"fpexcept.strict") + ret <4 x i32> %ret +} + +define <4 x i8> @strict_vector_fptosi_v4f32_to_v4i8(<4 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptosi_v4f32_to_v4i8: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-32-NEXT: pand {{\.LCPI.*}}, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptosi_v4f32_to_v4i8: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-64-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-LABEL: strict_vector_fptosi_v4f32_to_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptosi_v4f32_to_v4i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v4f32_to_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <4 x i8> @llvm.experimental.constrained.fptosi.v4i8.v4f32(<4 x float> %a, + metadata !"fpexcept.strict") + ret <4 x i8> %ret +} + +define <4 x i8> @strict_vector_fptoui_v4f32_to_v4i8(<4 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptoui_v4f32_to_v4i8: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-32-NEXT: pand {{\.LCPI.*}}, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 +; SSE-32-NEXT: packuswb %xmm0, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptoui_v4f32_to_v4i8: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-64-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 +; SSE-64-NEXT: packuswb %xmm0, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-LABEL: strict_vector_fptoui_v4f32_to_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptoui_v4f32_to_v4i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v4f32_to_v4i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <4 x i8> @llvm.experimental.constrained.fptoui.v4i8.v4f32(<4 x float> %a, + metadata !"fpexcept.strict") + ret <4 x i8> %ret +} + +define <4 x i1> @strict_vector_fptosi_v4f32_to_v4i1(<4 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptosi_v4f32_to_v4i1: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptosi_v4f32_to_v4i1: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-LABEL: 
strict_vector_fptosi_v4f32_to_v4i1: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptosi_v4f32_to_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v4f32_to_v4i1: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <4 x i1> @llvm.experimental.constrained.fptosi.v4i1.v4f32(<4 x float> %a, + metadata !"fpexcept.strict") + ret <4 x i1> %ret +} + +define <4 x i1> @strict_vector_fptoui_v4f32_to_v4i1(<4 x float> %a) #0 { +; SSE-32-LABEL: strict_vector_fptoui_v4f32_to_v4i1: +; SSE-32: # %bb.0: +; SSE-32-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-32-NEXT: retl +; +; SSE-64-LABEL: strict_vector_fptoui_v4f32_to_v4i1: +; SSE-64: # %bb.0: +; SSE-64-NEXT: cvttps2dq %xmm0, %xmm0 +; SSE-64-NEXT: retq +; +; AVX-LABEL: strict_vector_fptoui_v4f32_to_v4i1: +; AVX: # %bb.0: +; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX-NEXT: ret{{[l|q]}} +; +; AVX512VL-LABEL: strict_vector_fptoui_v4f32_to_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v4f32_to_v4i1: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %xmm0, %xmm0 +; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; 
AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <4 x i1> @llvm.experimental.constrained.fptoui.v4i1.v4f32(<4 x float> %a, + metadata !"fpexcept.strict") + ret <4 x i1> %ret +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll new file mode 100644 index 0000000000000..bcb002823d9d1 --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -0,0 +1,1366 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -disable-strictnode-mutation < %s -mtriple=i686-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX-32 +; RUN: llc -disable-strictnode-mutation < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX-64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f,avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VL-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VL-64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512f,avx512dq -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX512DQ,AVX512DQ-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,avx512dq -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX512DQ,AVX512DQ-64 + + +declare <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f64(<4 x double>, metadata) +declare <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f64(<4 x double>, metadata) +declare <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f32(<4 x float>, metadata) +declare <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f32(<4 x float>, metadata) +declare <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f64(<4 x double>, metadata) +declare <4 x i32> 
@llvm.experimental.constrained.fptoui.v4i32.v4f64(<4 x double>, metadata) +declare <4 x i16> @llvm.experimental.constrained.fptosi.v4i16.v4f64(<4 x double>, metadata) +declare <4 x i16> @llvm.experimental.constrained.fptoui.v4i16.v4f64(<4 x double>, metadata) +declare <4 x i8> @llvm.experimental.constrained.fptosi.v4i8.v4f64(<4 x double>, metadata) +declare <4 x i8> @llvm.experimental.constrained.fptoui.v4i8.v4f64(<4 x double>, metadata) +declare <4 x i8> @llvm.experimental.constrained.fptosi.v4i8.v4f32(<4 x float>, metadata) +declare <4 x i8> @llvm.experimental.constrained.fptoui.v4i8.v4f32(<4 x float>, metadata) +declare <4 x i1> @llvm.experimental.constrained.fptosi.v4i1.v4f64(<4 x double>, metadata) +declare <4 x i1> @llvm.experimental.constrained.fptoui.v4i1.v4f64(<4 x double>, metadata) +declare <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f32(<8 x float>, metadata) +declare <8 x i32> @llvm.experimental.constrained.fptoui.v8i32.v8f32(<8 x float>, metadata) +declare <8 x i16> @llvm.experimental.constrained.fptosi.v8i16.v8f32(<8 x float>, metadata) +declare <8 x i16> @llvm.experimental.constrained.fptoui.v8i16.v8f32(<8 x float>, metadata) +declare <8 x i8> @llvm.experimental.constrained.fptosi.v8i8.v8f32(<8 x float>, metadata) +declare <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f32(<8 x float>, metadata) +declare <8 x i1> @llvm.experimental.constrained.fptosi.v8i1.v8f32(<8 x float>, metadata) +declare <8 x i1> @llvm.experimental.constrained.fptoui.v8i1.v8f32(<8 x float>, metadata) + +define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 { +; AVX-32-LABEL: strict_vector_fptosi_v4f64_to_v4i64: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $32, %esp +; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovhps 
%xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovhps %xmm0, (%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrd $2, (%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptosi_v4f64_to_v4i64: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vcvttsd2si %xmm1, %rax +; AVX-64-NEXT: vmovq %rax, %xmm2 +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX-64-NEXT: vcvttsd2si %xmm1, %rax +; AVX-64-NEXT: vmovq %rax, %xmm1 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX-64-NEXT: vmovq %rax, %xmm2 +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX-64-NEXT: vmovq %rax, %xmm0 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: retq +; +; AVX512VL-32-LABEL: strict_vector_fptosi_v4f64_to_v4i64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset 
%ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $32, %esp +; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovhps %xmm0, (%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; +; AVX512VL-64-LABEL: strict_vector_fptosi_v4f64_to_v4i64: +; AVX512VL-64: # %bb.0: +; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-64-NEXT: vcvttsd2si %xmm1, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512VL-64-NEXT: vcvttsd2si %xmm1, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm1 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpermilpd 
{{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm0 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512VL-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-64-NEXT: retq +; +; FIXME: This is an unsafe behavior for strict FP +; AVX512DQ-LABEL: strict_vector_fptosi_v4f64_to_v4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f64(<4 x double> %a, + metadata !"fpexcept.strict") + ret <4 x i64> %ret +} + +define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 { +; AVX-32-LABEL: strict_vector_fptoui_v4f64_to_v4i64: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $32, %esp +; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: jb .LBB1_2 +; AVX-32-NEXT: # %bb.1: +; AVX-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX-32-NEXT: .LBB1_2: +; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: setae %al +; AVX-32-NEXT: movzbl %al, %eax +; AVX-32-NEXT: shll $31, %eax +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX-32-NEXT: vucomisd %xmm1, %xmm4 +; AVX-32-NEXT: vxorpd %xmm5, %xmm5, %xmm5 +; AVX-32-NEXT: jb .LBB1_4 +; AVX-32-NEXT: # %bb.3: +; 
AVX-32-NEXT: vmovapd %xmm1, %xmm5 +; AVX-32-NEXT: .LBB1_4: +; AVX-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 +; AVX-32-NEXT: vmovsd %xmm4, (%esp) +; AVX-32-NEXT: fldl (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: setae %cl +; AVX-32-NEXT: movzbl %cl, %ecx +; AVX-32-NEXT: shll $31, %ecx +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX-32-NEXT: vxorpd %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: jb .LBB1_6 +; AVX-32-NEXT: # %bb.5: +; AVX-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX-32-NEXT: .LBB1_6: +; AVX-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: setae %dl +; AVX-32-NEXT: movzbl %dl, %edx +; AVX-32-NEXT: shll $31, %edx +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX-32-NEXT: vucomisd %xmm1, %xmm0 +; AVX-32-NEXT: jb .LBB1_8 +; AVX-32-NEXT: # %bb.7: +; AVX-32-NEXT: vmovapd %xmm1, %xmm2 +; AVX-32-NEXT: .LBB1_8: +; AVX-32-NEXT: vsubsd %xmm2, %xmm0, %xmm0 +; AVX-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: setae %cl +; AVX-32-NEXT: movzbl %cl, %ecx +; AVX-32-NEXT: shll $31, %ecx +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; AVX-32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v4f64_to_v4i64: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; 
AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomisd %xmm1, %xmm2 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltsd %xmm1, %xmm2, %xmm3 +; AVX-64-NEXT: vandnpd %xmm1, %xmm3, %xmm3 +; AVX-64-NEXT: vsubsd %xmm3, %xmm2, %xmm3 +; AVX-64-NEXT: vcvttsd2si %xmm3, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm3 +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomisd %xmm1, %xmm2 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltsd %xmm1, %xmm2, %xmm4 +; AVX-64-NEXT: vandnpd %xmm1, %xmm4, %xmm4 +; AVX-64-NEXT: vsubsd %xmm4, %xmm2, %xmm2 +; AVX-64-NEXT: vcvttsd2si %xmm2, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm2 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomisd %xmm1, %xmm0 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltsd %xmm1, %xmm0, %xmm3 +; AVX-64-NEXT: vandnpd %xmm1, %xmm3, %xmm3 +; AVX-64-NEXT: vsubsd %xmm3, %xmm0, %xmm3 +; AVX-64-NEXT: vcvttsd2si %xmm3, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm3 +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomisd %xmm1, %xmm0 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltsd %xmm1, %xmm0, %xmm4 +; AVX-64-NEXT: vandnpd %xmm1, %xmm4, %xmm1 +; AVX-64-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vcvttsd2si %xmm0, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm0 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: retq +; +; AVX512VL-32-LABEL: strict_vector_fptoui_v4f64_to_v4i64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset 
%ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: pushl %ebx +; AVX512VL-32-NEXT: pushl %esi +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $32, %esp +; AVX512VL-32-NEXT: .cfi_offset %esi, -16 +; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm2 +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 +; AVX512VL-32-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl %eax, %esi +; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] +; AVX512VL-32-NEXT: xorl %ecx, %ecx +; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm4 +; AVX512VL-32-NEXT: setb %dl +; AVX512VL-32-NEXT: kmovw %edx, %k1 +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 +; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm5, %xmm5 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 +; AVX512VL-32-NEXT: vmovsd %xmm4, (%esp) +; AVX512VL-32-NEXT: fldl (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: shll $31, %ecx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: xorl %edx, %edx +; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm2 +; AVX512VL-32-NEXT: setb %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm2, %xmm2 +; AVX512VL-32-NEXT: 
vmovsd %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: setae %dl +; AVX512VL-32-NEXT: shll $31, %edx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: xorl %ebx, %ebx +; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm0 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm3, %xmm1, %xmm1 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: setae %bl +; AVX512VL-32-NEXT: shll $31, %ebx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-32-NEXT: leal -8(%ebp), %esp +; AVX512VL-32-NEXT: popl %esi +; AVX512VL-32-NEXT: popl %ebx +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; +; AVX512VL-64-LABEL: strict_vector_fptoui_v4f64_to_v4i64: +; AVX512VL-64: # %bb.0: +; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-64-NEXT: vcvttsd2usi %xmm1, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512VL-64-NEXT: vcvttsd2usi %xmm1, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm1 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: 
vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm0 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512VL-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-64-NEXT: retq +; +; FIXME: This is an unsafe behavior for strict FP +; AVX512DQ-LABEL: strict_vector_fptoui_v4f64_to_v4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f64(<4 x double> %a, + metadata !"fpexcept.strict") + ret <4 x i64> %ret +} + +define <4 x i64> @strict_vector_fptosi_v4f32_to_v4i64(<4 x float> %a) #0 { +; AVX-32-LABEL: strict_vector_fptosi_v4f32_to_v4i64: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $32, %esp +; AVX-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractps $3, %xmm0, (%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; 
AVX-32-NEXT: vpinsrd $2, (%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptosi_v4f32_to_v4i64: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX-64-NEXT: vcvttss2si %xmm1, %rax +; AVX-64-NEXT: vmovq %rax, %xmm1 +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX-64-NEXT: vcvttss2si %xmm2, %rax +; AVX-64-NEXT: vmovq %rax, %xmm2 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX-64-NEXT: vcvttss2si %xmm0, %rax +; AVX-64-NEXT: vmovq %rax, %xmm2 +; AVX-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-64-NEXT: vcvttss2si %xmm0, %rax +; AVX-64-NEXT: vmovq %rax, %xmm0 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: retq +; +; AVX512VL-32-LABEL: strict_vector_fptosi_v4f32_to_v4i64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $32, %esp +; AVX512VL-32-NEXT: vmovd %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractps $3, %xmm0, (%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; 
AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; +; AVX512VL-64-LABEL: strict_vector_fptosi_v4f32_to_v4i64: +; AVX512VL-64: # %bb.0: +; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX512VL-64-NEXT: vcvttss2si %xmm1, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm1 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-64-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-64-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm0 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512VL-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-64-NEXT: retq +; +; AVX512DQ-LABEL: strict_vector_fptosi_v4f32_to_v4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f32(<4 x float> %a, + metadata !"fpexcept.strict") + ret <4 x i64> %ret +} + +define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 { +; AVX-32-LABEL: strict_vector_fptoui_v4f32_to_v4i64: +; 
AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $32, %esp +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: jb .LBB3_2 +; AVX-32-NEXT: # %bb.1: +; AVX-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX-32-NEXT: .LBB3_2: +; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: setae %al +; AVX-32-NEXT: movzbl %al, %eax +; AVX-32-NEXT: shll $31, %eax +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3] +; AVX-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: jb .LBB3_4 +; AVX-32-NEXT: # %bb.3: +; AVX-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX-32-NEXT: .LBB3_4: +; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX-32-NEXT: vmovss %xmm3, (%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: setae %cl +; AVX-32-NEXT: movzbl %cl, %ecx +; AVX-32-NEXT: shll $31, %ecx +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX-32-NEXT: vxorps %xmm4, %xmm4, %xmm4 +; AVX-32-NEXT: jb .LBB3_6 +; AVX-32-NEXT: # %bb.5: +; AVX-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX-32-NEXT: .LBB3_6: +; AVX-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: setae %dl +; AVX-32-NEXT: movzbl %dl, %edx +; AVX-32-NEXT: shll $31, %edx +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; 
AVX-32-NEXT: vucomiss %xmm1, %xmm0 +; AVX-32-NEXT: jb .LBB3_8 +; AVX-32-NEXT: # %bb.7: +; AVX-32-NEXT: vmovaps %xmm1, %xmm2 +; AVX-32-NEXT: .LBB3_8: +; AVX-32-NEXT: vsubss %xmm2, %xmm0, %xmm0 +; AVX-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX-32-NEXT: setae %cl +; AVX-32-NEXT: movzbl %cl, %ecx +; AVX-32-NEXT: shll $31, %ecx +; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, %ecx, %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; AVX-32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v4f32_to_v4i64: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomiss %xmm1, %xmm2 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltss %xmm1, %xmm2, %xmm3 +; AVX-64-NEXT: vandnps %xmm1, %xmm3, %xmm3 +; AVX-64-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vcvttss2si %xmm2, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm2 +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomiss %xmm1, %xmm3 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltss %xmm1, %xmm3, %xmm4 +; AVX-64-NEXT: vandnps %xmm1, %xmm4, %xmm4 +; AVX-64-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX-64-NEXT: vcvttss2si %xmm3, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq 
%rcx, %xmm3 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomiss %xmm1, %xmm0 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltss %xmm1, %xmm0, %xmm3 +; AVX-64-NEXT: vandnps %xmm1, %xmm3, %xmm3 +; AVX-64-NEXT: vsubss %xmm3, %xmm0, %xmm3 +; AVX-64-NEXT: vcvttss2si %xmm3, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm3 +; AVX-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-64-NEXT: xorl %eax, %eax +; AVX-64-NEXT: vucomiss %xmm1, %xmm0 +; AVX-64-NEXT: setae %al +; AVX-64-NEXT: shlq $63, %rax +; AVX-64-NEXT: vcmpltss %xmm1, %xmm0, %xmm4 +; AVX-64-NEXT: vandnps %xmm1, %xmm4, %xmm1 +; AVX-64-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vcvttss2si %xmm0, %rcx +; AVX-64-NEXT: xorq %rax, %rcx +; AVX-64-NEXT: vmovq %rcx, %xmm0 +; AVX-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: retq +; +; AVX512VL-32-LABEL: strict_vector_fptoui_v4f32_to_v4i64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: pushl %ebx +; AVX512VL-32-NEXT: pushl %esi +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $32, %esp +; AVX512VL-32-NEXT: .cfi_offset %esi, -16 +; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 +; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm2 +; AVX512VL-32-NEXT: setb %cl +; AVX512VL-32-NEXT: kmovw %ecx, %k1 +; AVX512VL-32-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) +; 
AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl %eax, %esi +; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX512VL-32-NEXT: xorl %ecx, %ecx +; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm2 +; AVX512VL-32-NEXT: setb %dl +; AVX512VL-32-NEXT: kmovw %edx, %k1 +; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovss %xmm2, (%esp) +; AVX512VL-32-NEXT: flds (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: shll $31, %ecx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-32-NEXT: xorl %edx, %edx +; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm2 +; AVX512VL-32-NEXT: setb %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovss %xmm3, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm4, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: setae %dl +; AVX512VL-32-NEXT: shll $31, %edx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: xorl %ebx, %ebx +; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm0 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovss %xmm3, %xmm1, %xmm1 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd 
$2, (%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: setae %bl +; AVX512VL-32-NEXT: shll $31, %ebx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ebx +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %ebx, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-32-NEXT: leal -8(%ebp), %esp +; AVX512VL-32-NEXT: popl %esi +; AVX512VL-32-NEXT: popl %ebx +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; +; AVX512VL-64-LABEL: strict_vector_fptoui_v4f32_to_v4i64: +; AVX512VL-64: # %bb.0: +; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX512VL-64-NEXT: vcvttss2usi %xmm1, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm1 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-64-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-64-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm0 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] +; AVX512VL-64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-64-NEXT: retq +; +; AVX512DQ-LABEL: strict_vector_fptoui_v4f32_to_v4i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f32(<4 x float> %a, + metadata !"fpexcept.strict") + ret <4 x i64> %ret +} + +define <4 x i32> @strict_vector_fptosi_v4f64_to_v4i32(<4 x double> %a) #0 { 
+; CHECK-LABEL: strict_vector_fptosi_v4f64_to_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f64(<4 x double> %a, + metadata !"fpexcept.strict") + ret <4 x i32> %ret +} + +define <4 x i32> @strict_vector_fptoui_v4f64_to_v4i32(<4 x double> %a) #0 { +; AVX-32-LABEL: strict_vector_fptoui_v4f64_to_v4i32: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $32, %esp +; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vmovhps %xmm0, (%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: fldl (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $3, (%esp), %xmm0, %xmm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: vzeroupper +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v4f64_to_v4i32: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX-64-NEXT: vcvttsd2si %xmm1, %rax +; AVX-64-NEXT: vcvttsd2si %xmm0, %rcx +; AVX-64-NEXT: vmovd %ecx, %xmm1 +; AVX-64-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX-64-NEXT: vpinsrd 
$2, %eax, %xmm1, %xmm1 +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX-64-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX-64-NEXT: vzeroupper +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptoui_v4f64_to_v4i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512VL-NEXT: vcvttsd2usi %xmm1, %eax +; AVX512VL-NEXT: vcvttsd2usi %xmm0, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm1 +; AVX512VL-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vcvttsd2usi %xmm0, %eax +; AVX512VL-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-NEXT: vcvttsd2usi %xmm0, %eax +; AVX512VL-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v4f64_to_v4i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] +; AVX512DQ-NEXT: vcvttsd2usi %xmm1, %eax +; AVX512DQ-NEXT: vcvttsd2usi %xmm0, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm1 +; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512DQ-NEXT: vcvttsd2usi %xmm0, %eax +; AVX512DQ-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512DQ-NEXT: vcvttsd2usi %xmm0, %eax +; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <4 x i32> @llvm.experimental.constrained.fptoui.v4i32.v4f64(<4 x double> %a, + metadata !"fpexcept.strict") + ret <4 x i32> %ret +} + +define <4 x i16> @strict_vector_fptosi_v4f64_to_v4i16(<4 x double> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v4f64_to_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 +; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i16> 
@llvm.experimental.constrained.fptosi.v4i16.v4f64(<4 x double> %a, + metadata !"fpexcept.strict") + ret <4 x i16> %ret +} + +define <4 x i16> @strict_vector_fptoui_v4f64_to_v4i16(<4 x double> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v4f64_to_v4i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 +; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i16> @llvm.experimental.constrained.fptoui.v4i16.v4f64(<4 x double> %a, + metadata !"fpexcept.strict") + ret <4 x i16> %ret +} + +define <4 x i8> @strict_vector_fptosi_v4f64_to_v4i8(<4 x double> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v4f64_to_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i8> @llvm.experimental.constrained.fptosi.v4i8.v4f64(<4 x double> %a, + metadata !"fpexcept.strict") + ret <4 x i8> %ret +} + +define <4 x i8> @strict_vector_fptoui_v4f64_to_v4i8(<4 x double> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v4f64_to_v4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <4 x i8> @llvm.experimental.constrained.fptoui.v4i8.v4f64(<4 x double> %a, + metadata !"fpexcept.strict") + ret <4 x i8> %ret +} + +define <4 x i1> @strict_vector_fptosi_v4f64_to_v4i1(<4 x double> %a) #0 { +; AVX-32-LABEL: strict_vector_fptosi_v4f64_to_v4i1: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX-32-NEXT: vzeroupper +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptosi_v4f64_to_v4i1: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX-64-NEXT: vzeroupper +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptosi_v4f64_to_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %ymm0, 
%xmm0 +; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v4f64_to_v4i1: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <4 x i1> @llvm.experimental.constrained.fptosi.v4i1.v4f64(<4 x double> %a, + metadata !"fpexcept.strict") + ret <4 x i1> %ret +} + +define <4 x i1> @strict_vector_fptoui_v4f64_to_v4i1(<4 x double> %a) #0 { +; AVX-32-LABEL: strict_vector_fptoui_v4f64_to_v4i1: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX-32-NEXT: vzeroupper +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v4f64_to_v4i1: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX-64-NEXT: vzeroupper +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptoui_v4f64_to_v4i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v4f64_to_v4i1: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %ymm0, %xmm0 +; AVX512DQ-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <4 x i1> @llvm.experimental.constrained.fptoui.v4i1.v4f64(<4 x double> %a, + metadata !"fpexcept.strict") + ret <4 x i1> %ret +} + +define <8 x i32> 
@strict_vector_fptosi_v8f32_to_v8i32(<8 x float> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v8f32_to_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f32(<8 x float> %a, + metadata !"fpexcept.strict") + ret <8 x i32> %ret +} + +define <8 x i32> @strict_vector_fptoui_v8f32_to_v8i32(<8 x float> %a) #0 { +; AVX-32-LABEL: strict_vector_fptoui_v8f32_to_v8i32: +; AVX-32: # %bb.0: +; AVX-32-NEXT: pushl %ebp +; AVX-32-NEXT: .cfi_def_cfa_offset 8 +; AVX-32-NEXT: .cfi_offset %ebp, -8 +; AVX-32-NEXT: movl %esp, %ebp +; AVX-32-NEXT: .cfi_def_cfa_register %ebp +; AVX-32-NEXT: andl $-8, %esp +; AVX-32-NEXT: subl $64, %esp +; AVX-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) +; AVX-32-NEXT: vextractps $3, %xmm0, (%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX-32-NEXT: flds (%esp) +; AVX-32-NEXT: fisttpll (%esp) +; AVX-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; 
AVX-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vpinsrd $3, (%esp), %xmm1, %xmm1 +; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-32-NEXT: movl %ebp, %esp +; AVX-32-NEXT: popl %ebp +; AVX-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v8f32_to_v8i32: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX-64-NEXT: vcvttss2si %xmm2, %rax +; AVX-64-NEXT: vcvttss2si %xmm1, %rcx +; AVX-64-NEXT: vmovd %ecx, %xmm2 +; AVX-64-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-64-NEXT: vcvttss2si %xmm3, %rax +; AVX-64-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX-64-NEXT: vcvttss2si %xmm1, %rax +; AVX-64-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 +; AVX-64-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX-64-NEXT: vcvttss2si %xmm2, %rax +; AVX-64-NEXT: vcvttss2si %xmm0, %rcx +; AVX-64-NEXT: vmovd %ecx, %xmm2 +; AVX-64-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX-64-NEXT: vcvttss2si %xmm3, %rax +; AVX-64-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX-64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX-64-NEXT: vcvttss2si %xmm0, %rax +; AVX-64-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptoui_v8f32_to_v8i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512VL-NEXT: vcvttss2usi %xmm2, %eax +; AVX512VL-NEXT: vcvttss2usi %xmm1, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm2 +; AVX512VL-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; 
AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512VL-NEXT: vcvttss2usi %xmm3, %eax +; AVX512VL-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX512VL-NEXT: vcvttss2usi %xmm1, %eax +; AVX512VL-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512VL-NEXT: vcvttss2usi %xmm2, %eax +; AVX512VL-NEXT: vcvttss2usi %xmm0, %ecx +; AVX512VL-NEXT: vmovd %ecx, %xmm2 +; AVX512VL-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512VL-NEXT: vcvttss2usi %xmm3, %eax +; AVX512VL-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512VL-NEXT: vcvttss2usi %xmm0, %eax +; AVX512VL-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v8f32_to_v8i32: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX512DQ-NEXT: vcvttss2usi %xmm2, %eax +; AVX512DQ-NEXT: vcvttss2usi %xmm1, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm2 +; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512DQ-NEXT: vcvttss2usi %xmm3, %eax +; AVX512DQ-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] +; AVX512DQ-NEXT: vcvttss2usi %xmm1, %eax +; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 +; AVX512DQ-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX512DQ-NEXT: vcvttss2usi %xmm2, %eax +; AVX512DQ-NEXT: vcvttss2usi %xmm0, %ecx +; AVX512DQ-NEXT: vmovd %ecx, %xmm2 +; AVX512DQ-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512DQ-NEXT: vcvttss2usi %xmm3, %eax +; AVX512DQ-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; AVX512DQ-NEXT: 
vcvttss2usi %xmm0, %eax +; AVX512DQ-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i32> @llvm.experimental.constrained.fptoui.v8i32.v8f32(<8 x float> %a, + metadata !"fpexcept.strict") + ret <8 x i32> %ret +} + +define <8 x i16> @strict_vector_fptosi_v8f32_to_v8i16(<8 x float> %a) #0 { +; AVX-32-LABEL: strict_vector_fptosi_v8f32_to_v8i16: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-32-NEXT: vzeroupper +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptosi_v8f32_to_v8i16: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vzeroupper +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptosi_v8f32_to_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v8f32_to_v8i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i16> @llvm.experimental.constrained.fptosi.v8i16.v8f32(<8 x float> %a, + metadata !"fpexcept.strict") + ret <8 x i16> %ret +} + +define <8 x i16> @strict_vector_fptoui_v8f32_to_v8i16(<8 x float> %a) #0 { +; AVX-32-LABEL: strict_vector_fptoui_v8f32_to_v8i16: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-32-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX-32-NEXT: vzeroupper +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v8f32_to_v8i16: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vcvttps2dq %ymm0, %ymm0 +; 
AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vzeroupper +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptoui_v8f32_to_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v8f32_to_v8i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i16> @llvm.experimental.constrained.fptoui.v8i16.v8f32(<8 x float> %a, + metadata !"fpexcept.strict") + ret <8 x i16> %ret +} + +define <8 x i8> @strict_vector_fptosi_v8f32_to_v8i8(<8 x float> %a) #0 { +; AVX-32-LABEL: strict_vector_fptosi_v8f32_to_v8i8: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-32-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-32-NEXT: vzeroupper +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptosi_v8f32_to_v8i8: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-64-NEXT: vzeroupper +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptosi_v8f32_to_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v8f32_to_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i8> @llvm.experimental.constrained.fptosi.v8i8.v8f32(<8 x float> %a, + 
metadata !"fpexcept.strict") + ret <8 x i8> %ret +} + +define <8 x i8> @strict_vector_fptoui_v8f32_to_v8i8(<8 x float> %a) #0 { +; AVX-32-LABEL: strict_vector_fptoui_v8f32_to_v8i8: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-32-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-32-NEXT: vzeroupper +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v8f32_to_v8i8: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-64-NEXT: vzeroupper +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptoui_v8f32_to_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v8f32_to_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f32(<8 x float> %a, + metadata !"fpexcept.strict") + ret <8 x i8> %ret +} + +define <8 x i1> @strict_vector_fptosi_v8f32_to_v8i1(<8 x float> %a) #0 { +; AVX-32-LABEL: strict_vector_fptosi_v8f32_to_v8i1: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-32-NEXT: vzeroupper +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptosi_v8f32_to_v8i1: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vzeroupper +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptosi_v8f32_to_v8i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: 
vcvttps2dq %ymm0, %ymm0 +; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v8f32_to_v8i1: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i1> @llvm.experimental.constrained.fptosi.v8i1.v8f32(<8 x float> %a, + metadata !"fpexcept.strict") + ret <8 x i1> %ret +} + +define <8 x i1> @strict_vector_fptoui_v8f32_to_v8i1(<8 x float> %a) #0 { +; AVX-32-LABEL: strict_vector_fptoui_v8f32_to_v8i1: +; AVX-32: # %bb.0: +; AVX-32-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-32-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX-32-NEXT: vzeroupper +; AVX-32-NEXT: retl +; +; AVX-64-LABEL: strict_vector_fptoui_v8f32_to_v8i1: +; AVX-64: # %bb.0: +; AVX-64-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX-64-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 +; AVX-64-NEXT: vzeroupper +; AVX-64-NEXT: retq +; +; AVX512VL-LABEL: strict_vector_fptoui_v8f32_to_v8i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v8f32_to_v8i1: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %ymm0, %ymm0 +; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 
+; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i1> @llvm.experimental.constrained.fptoui.v8i1.v8f32(<8 x float> %a, + metadata !"fpexcept.strict") + ret <8 x i1> %ret +} + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll new file mode 100644 index 0000000000000..26806db74a1ed --- /dev/null +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll @@ -0,0 +1,929 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f,avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VL-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,avx512vl -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX512VL,AVX512VL-64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx512f,avx512dq -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX512DQ,AVX512DQ-32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,avx512dq -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,AVX512DQ,AVX512DQ-64 + +declare <8 x i64> @llvm.experimental.constrained.fptosi.v8i64.v8f64(<8 x double>, metadata) +declare <8 x i64> @llvm.experimental.constrained.fptoui.v8i64.v8f64(<8 x double>, metadata) +declare <8 x i64> @llvm.experimental.constrained.fptosi.v8i64.v8f32(<8 x float>, metadata) +declare <8 x i64> @llvm.experimental.constrained.fptoui.v8i64.v8f32(<8 x float>, metadata) +declare <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f64(<8 x double>, metadata) +declare <8 x i32> @llvm.experimental.constrained.fptoui.v8i32.v8f64(<8 x double>, metadata) +declare <8 x i16> @llvm.experimental.constrained.fptosi.v8i16.v8f64(<8 x double>, metadata) +declare <8 x i16> 
@llvm.experimental.constrained.fptoui.v8i16.v8f64(<8 x double>, metadata) +declare <8 x i8> @llvm.experimental.constrained.fptosi.v8i8.v8f64(<8 x double>, metadata) +declare <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f64(<8 x double>, metadata) +declare <8 x i1> @llvm.experimental.constrained.fptosi.v8i1.v8f64(<8 x double>, metadata) +declare <8 x i1> @llvm.experimental.constrained.fptoui.v8i1.v8f64(<8 x double>, metadata) +declare <8 x i1> @llvm.experimental.constrained.fptosi.v8i1.v8f32(<8 x float>, metadata) +declare <8 x i1> @llvm.experimental.constrained.fptoui.v8i1.v8f32(<8 x float>, metadata) + +declare <16 x i32> @llvm.experimental.constrained.fptosi.v16i32.v16f32(<16 x float>, metadata) +declare <16 x i32> @llvm.experimental.constrained.fptoui.v16i32.v16f32(<16 x float>, metadata) +declare <16 x i16> @llvm.experimental.constrained.fptosi.v16i16.v16f32(<16 x float>, metadata) +declare <16 x i16> @llvm.experimental.constrained.fptoui.v16i16.v16f32(<16 x float>, metadata) +declare <16 x i8> @llvm.experimental.constrained.fptosi.v16i8.v16f32(<16 x float>, metadata) +declare <16 x i8> @llvm.experimental.constrained.fptoui.v16i8.v16f32(<16 x float>, metadata) +declare <16 x i1> @llvm.experimental.constrained.fptosi.v16i1.v16f32(<16 x float>, metadata) +declare <16 x i1> @llvm.experimental.constrained.fptoui.v16i1.v16f32(<16 x float>, metadata) + +define <8 x i64> @strict_vector_fptosi_v8f64_to_v8i64(<8 x double> %a) #0 { +; AVX512VL-32-LABEL: strict_vector_fptosi_v8f64_to_v8i64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $64, %esp +; AVX512VL-32-NEXT: vextractf32x4 $2, %zmm0, %xmm1 +; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovhps %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: 
vextractf32x4 $3, %zmm0, %xmm1 +; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovhps %xmm1, (%esp) +; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: 
vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; +; AVX512VL-64-LABEL: strict_vector_fptosi_v8f64_to_v8i64: +; AVX512VL-64: # %bb.0: +; AVX512VL-64-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512VL-64-NEXT: vcvttsd2si %xmm1, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512VL-64-NEXT: vcvttsd2si %xmm1, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm1 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-64-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512VL-64-NEXT: vcvttsd2si %xmm2, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm3 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512VL-64-NEXT: vcvttsd2si %xmm2, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-64-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512VL-64-NEXT: vcvttsd2si %xmm2, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm3 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512VL-64-NEXT: vcvttsd2si %xmm2, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm3 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm0 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512VL-64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; 
AVX512VL-64-NEXT: retq +; +; AVX512DQ-LABEL: strict_vector_fptosi_v8f64_to_v8i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i64> @llvm.experimental.constrained.fptosi.v8i64.v8f64(<8 x double> %a, + metadata !"fpexcept.strict") + ret <8 x i64> %ret +} + +define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 { +; AVX512VL-32-LABEL: strict_vector_fptoui_v8f64_to_v8i64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: pushl %ebx +; AVX512VL-32-NEXT: pushl %edi +; AVX512VL-32-NEXT: pushl %esi +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $80, %esp +; AVX512VL-32-NEXT: .cfi_offset %esi, -20 +; AVX512VL-32-NEXT: .cfi_offset %edi, -16 +; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm4 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; 
AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm5, %xmm5 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 +; AVX512VL-32-NEXT: vmovsd %xmm4, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl %eax, %edi +; AVX512VL-32-NEXT: vextractf32x4 $2, %zmm0, %xmm3 +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm4 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm5, %xmm5 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 +; AVX512VL-32-NEXT: vmovsd %xmm4, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl %eax, %esi +; AVX512VL-32-NEXT: xorl %edx, %edx +; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovsd 
%xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: setae %dl +; AVX512VL-32-NEXT: shll $31, %edx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: vextractf32x4 $3, %zmm0, %xmm3 +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm4 = xmm3[1,0] +; AVX512VL-32-NEXT: xorl %ecx, %ecx +; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm4 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm5 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm5, %xmm5 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm5, %xmm4, %xmm4 +; AVX512VL-32-NEXT: vmovsd %xmm4, (%esp) +; AVX512VL-32-NEXT: fldl (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: shll $31, %ecx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: vmovapd %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: vucomisd %xmm1, %xmm0 +; AVX512VL-32-NEXT: setb %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: vmovsd %xmm2, %xmm1, %xmm1 {%k1} +; AVX512VL-32-NEXT: vsubsd %xmm1, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; 
AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: movzbl %al, %eax +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 +; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-32-NEXT: leal -12(%ebp), %esp +; AVX512VL-32-NEXT: popl %esi +; AVX512VL-32-NEXT: popl %edi +; AVX512VL-32-NEXT: popl %ebx +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; +; AVX512VL-64-LABEL: strict_vector_fptoui_v8f64_to_v8i64: +; AVX512VL-64: # %bb.0: +; AVX512VL-64-NEXT: vextractf32x4 $3, %zmm0, %xmm1 +; AVX512VL-64-NEXT: vcvttsd2usi %xmm1, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; AVX512VL-64-NEXT: vcvttsd2usi %xmm1, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm1 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512VL-64-NEXT: vextractf32x4 $2, %zmm0, %xmm2 +; AVX512VL-64-NEXT: vcvttsd2usi %xmm2, %rax +; AVX512VL-64-NEXT: 
vmovq %rax, %xmm3 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512VL-64-NEXT: vcvttsd2usi %xmm2, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-64-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512VL-64-NEXT: vcvttsd2usi %xmm2, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm3 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; AVX512VL-64-NEXT: vcvttsd2usi %xmm2, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm3 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm0 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512VL-64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-64-NEXT: retq +; +; AVX512DQ-LABEL: strict_vector_fptoui_v8f64_to_v8i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i64> @llvm.experimental.constrained.fptoui.v8i64.v8f64(<8 x double> %a, + metadata !"fpexcept.strict") + ret <8 x i64> %ret +} + +define <8 x i64> @strict_vector_fptosi_v8f32_to_v8i64(<8 x float> %a) #0 { +; AVX512VL-32-LABEL: strict_vector_fptosi_v8f32_to_v8i64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $64, %esp +; AVX512VL-32-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-32-NEXT: vmovd %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractps $1, %xmm1, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractps $2, %xmm1, 
{{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractps $3, %xmm1, (%esp) +; AVX512VL-32-NEXT: vmovd %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractps $1, %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractps $2, %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vextractps $3, %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $2, (%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), 
%xmm2, %xmm2 +; AVX512VL-32-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-32-NEXT: movl %ebp, %esp +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; +; AVX512VL-64-LABEL: strict_vector_fptosi_v8f32_to_v8i64: +; AVX512VL-64: # %bb.0: +; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] +; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512VL-64-NEXT: vcvttss2si %xmm3, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm3 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-64-NEXT: vcvttss2si %xmm1, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm3 +; AVX512VL-64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX512VL-64-NEXT: vcvttss2si %xmm1, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm1 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512VL-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512VL-64-NEXT: vcvttss2si %xmm3, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm3 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-64-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm3 +; AVX512VL-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-64-NEXT: vcvttss2si %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm0 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512VL-64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-64-NEXT: retq +; +; AVX512DQ-LABEL: 
strict_vector_fptosi_v8f32_to_v8i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i64> @llvm.experimental.constrained.fptosi.v8i64.v8f32(<8 x float> %a, + metadata !"fpexcept.strict") + ret <8 x i64> %ret +} + +define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 { +; AVX512VL-32-LABEL: strict_vector_fptoui_v8f32_to_v8i64: +; AVX512VL-32: # %bb.0: +; AVX512VL-32-NEXT: pushl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8 +; AVX512VL-32-NEXT: .cfi_offset %ebp, -8 +; AVX512VL-32-NEXT: movl %esp, %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp +; AVX512VL-32-NEXT: pushl %ebx +; AVX512VL-32-NEXT: pushl %edi +; AVX512VL-32-NEXT: pushl %esi +; AVX512VL-32-NEXT: andl $-8, %esp +; AVX512VL-32-NEXT: subl $80, %esp +; AVX512VL-32-NEXT: .cfi_offset %esi, -20 +; AVX512VL-32-NEXT: .cfi_offset %edi, -16 +; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 +; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3] +; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} 
+; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl %eax, %edi +; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] +; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm4 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm5 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm5, %xmm5 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm5, %xmm4, %xmm4 +; AVX512VL-32-NEXT: vmovss %xmm4, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: movl $0, %eax +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: movl %eax, %esi +; AVX512VL-32-NEXT: xorl %edx, %edx +; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} +; 
AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm4 +; AVX512VL-32-NEXT: vmovss %xmm4, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: setae %dl +; AVX512VL-32-NEXT: shll $31, %edx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %edx +; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[3,1,2,3] +; AVX512VL-32-NEXT: xorl %ecx, %ecx +; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm4 +; AVX512VL-32-NEXT: setb %al +; AVX512VL-32-NEXT: kmovw %eax, %k1 +; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm5 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm5, %xmm5 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm5, %xmm4, %xmm4 +; AVX512VL-32-NEXT: vmovss %xmm4, (%esp) +; AVX512VL-32-NEXT: flds (%esp) +; AVX512VL-32-NEXT: fisttpll (%esp) +; AVX512VL-32-NEXT: setae %cl +; AVX512VL-32-NEXT: shll $31, %ecx +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %ecx +; AVX512VL-32-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; AVX512VL-32-NEXT: xorl %eax, %eax +; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm3 +; AVX512VL-32-NEXT: setb %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: vmovaps %xmm1, %xmm4 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm4, %xmm4 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm4, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vmovss %xmm3, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: vucomiss %xmm1, %xmm0 +; AVX512VL-32-NEXT: setb %bl +; AVX512VL-32-NEXT: kmovw %ebx, %k1 +; AVX512VL-32-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} +; AVX512VL-32-NEXT: vsubss %xmm1, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: flds {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp) +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $2, 
(%esp), %xmm0, %xmm0 +; AVX512VL-32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %edx, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 +; AVX512VL-32-NEXT: vpinsrd $3, %esi, %xmm1, %xmm1 +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %edi, %xmm2, %xmm2 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm2, %xmm2 +; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm2, %xmm2 # 4-byte Folded Reload +; AVX512VL-32-NEXT: setae %al +; AVX512VL-32-NEXT: movzbl %al, %eax +; AVX512VL-32-NEXT: shll $31, %eax +; AVX512VL-32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; AVX512VL-32-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX512VL-32-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm3, %xmm3 +; AVX512VL-32-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm3, %xmm3 # 4-byte Folded Reload +; AVX512VL-32-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512VL-32-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm1 +; AVX512VL-32-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512VL-32-NEXT: leal -12(%ebp), %esp +; AVX512VL-32-NEXT: popl %esi +; AVX512VL-32-NEXT: popl %edi +; AVX512VL-32-NEXT: popl %ebx +; AVX512VL-32-NEXT: popl %ebp +; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4 +; AVX512VL-32-NEXT: retl +; +; AVX512VL-64-LABEL: strict_vector_fptoui_v8f32_to_v8i64: +; AVX512VL-64: # %bb.0: +; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] +; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX512VL-64-NEXT: vcvttss2usi %xmm3, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm3 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-64-NEXT: vcvttss2usi %xmm1, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm3 +; 
AVX512VL-64-NEXT: vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX512VL-64-NEXT: vcvttss2usi %xmm1, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm1 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] +; AVX512VL-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] +; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm2 +; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] +; AVX512VL-64-NEXT: vcvttss2usi %xmm3, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm3 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX512VL-64-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm3 +; AVX512VL-64-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-64-NEXT: vcvttss2usi %xmm0, %rax +; AVX512VL-64-NEXT: vmovq %rax, %xmm0 +; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] +; AVX512VL-64-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512VL-64-NEXT: retq +; +; AVX512DQ-LABEL: strict_vector_fptoui_v8f32_to_v8i64: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i64> @llvm.experimental.constrained.fptoui.v8i64.v8f32(<8 x float> %a, + metadata !"fpexcept.strict") + ret <8 x i64> %ret +} + +define <8 x i32> @strict_vector_fptosi_v8f64_to_v8i32(<8 x double> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v8f64_to_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dq %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x i32> @llvm.experimental.constrained.fptosi.v8i32.v8f64(<8 x double> %a, + metadata !"fpexcept.strict") + ret <8 x i32> %ret +} + +define <8 x i32> @strict_vector_fptoui_v8f64_to_v8i32(<8 x double> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v8f64_to_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf32x4 $2, %zmm0, %xmm1 +; CHECK-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] +; CHECK-NEXT: vcvttsd2usi %xmm2, 
%eax +; CHECK-NEXT: vcvttsd2usi %xmm1, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm1 +; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm2 +; CHECK-NEXT: vcvttsd2usi %xmm2, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] +; CHECK-NEXT: vcvttsd2usi %xmm2, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] +; CHECK-NEXT: vcvttsd2usi %xmm2, %eax +; CHECK-NEXT: vcvttsd2usi %xmm0, %ecx +; CHECK-NEXT: vmovd %ecx, %xmm2 +; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vcvttsd2usi %xmm0, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vcvttsd2usi %xmm0, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm0 +; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <8 x i32> @llvm.experimental.constrained.fptoui.v8i32.v8f64(<8 x double> %a, + metadata !"fpexcept.strict") + ret <8 x i32> %ret +} + +define <8 x i16> @strict_vector_fptosi_v8f64_to_v8i16(<8 x double> %a) #0 { +; AVX512VL-LABEL: strict_vector_fptosi_v8f64_to_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v8f64_to_v8i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i16> @llvm.experimental.constrained.fptosi.v8i16.v8f64(<8 x double> %a, + metadata !"fpexcept.strict") + ret <8 x i16> %ret +} + +define <8 x i16> @strict_vector_fptoui_v8f64_to_v8i16(<8 x double> %a) #0 { +; AVX512VL-LABEL: strict_vector_fptoui_v8f64_to_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: 
vcvttpd2dq %zmm0, %ymm0 +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v8f64_to_v8i16: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i16> @llvm.experimental.constrained.fptoui.v8i16.v8f64(<8 x double> %a, + metadata !"fpexcept.strict") + ret <8 x i16> %ret +} + +define <8 x i8> @strict_vector_fptosi_v8f64_to_v8i8(<8 x double> %a) #0 { +; AVX512VL-LABEL: strict_vector_fptosi_v8f64_to_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v8f64_to_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i8> @llvm.experimental.constrained.fptosi.v8i8.v8f64(<8 x double> %a, + metadata !"fpexcept.strict") + ret <8 x i8> %ret +} + +define <8 x i8> @strict_vector_fptoui_v8f64_to_v8i8(<8 x double> %a) #0 { +; AVX512VL-LABEL: strict_vector_fptoui_v8f64_to_v8i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v8f64_to_v8i8: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i8> @llvm.experimental.constrained.fptoui.v8i8.v8f64(<8 x double> %a, + metadata !"fpexcept.strict") + ret <8 x i8> %ret +} + +define <8 x i1> @strict_vector_fptosi_v8f64_to_v8i1(<8 x double> %a) #0 { +; AVX512VL-LABEL: 
strict_vector_fptosi_v8f64_to_v8i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v8f64_to_v8i1: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i1> @llvm.experimental.constrained.fptosi.v8i1.v8f64(<8 x double> %a, + metadata !"fpexcept.strict") + ret <8 x i1> %ret +} + +define <8 x i1> @strict_vector_fptoui_v8f64_to_v8i1(<8 x double> %a) #0 { +; AVX512VL-LABEL: strict_vector_fptoui_v8f64_to_v8i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 +; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v8f64_to_v8i1: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 +; AVX512DQ-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <8 x i1> @llvm.experimental.constrained.fptoui.v8i1.v8f64(<8 x double> %a, + metadata !"fpexcept.strict") + ret <8 x i1> %ret +} + +define <16 x i32> @strict_vector_fptosi_v16f32_to_v16i32(<16 x float> %a) #0 { +; CHECK-LABEL: 
strict_vector_fptosi_v16f32_to_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x i32> @llvm.experimental.constrained.fptosi.v16i32.v16f32(<16 x float> %a, + metadata !"fpexcept.strict") + ret <16 x i32> %ret +} + +define <16 x i32> @strict_vector_fptoui_v16f32_to_v16i32(<16 x float> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v16f32_to_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2udq %zmm0, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x i32> @llvm.experimental.constrained.fptoui.v16i32.v16f32(<16 x float> %a, + metadata !"fpexcept.strict") + ret <16 x i32> %ret +} + +define <16 x i16> @strict_vector_fptosi_v16f32_to_v16i16(<16 x float> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v16f32_to_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %zmm0, %zmm0 +; CHECK-NEXT: vpmovdw %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x i16> @llvm.experimental.constrained.fptosi.v16i16.v16f32(<16 x float> %a, + metadata !"fpexcept.strict") + ret <16 x i16> %ret +} + +define <16 x i16> @strict_vector_fptoui_v16f32_to_v16i16(<16 x float> %a) #0 { +; CHECK-LABEL: strict_vector_fptoui_v16f32_to_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %zmm0, %zmm0 +; CHECK-NEXT: vpmovdw %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x i16> @llvm.experimental.constrained.fptoui.v16i16.v16f32(<16 x float> %a, + metadata !"fpexcept.strict") + ret <16 x i16> %ret +} + +define <16 x i8> @strict_vector_fptosi_v16f32_to_v16i8(<16 x float> %a) #0 { +; CHECK-LABEL: strict_vector_fptosi_v16f32_to_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %zmm0, %zmm0 +; CHECK-NEXT: vpmovdb %zmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x i8> @llvm.experimental.constrained.fptosi.v16i8.v16f32(<16 x float> %a, + metadata !"fpexcept.strict") + ret <16 x i8> %ret +} + +define <16 x i8> @strict_vector_fptoui_v16f32_to_v16i8(<16 x float> %a) #0 { +; CHECK-LABEL: 
strict_vector_fptoui_v16f32_to_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %zmm0, %zmm0 +; CHECK-NEXT: vpmovdb %zmm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %ret = call <16 x i8> @llvm.experimental.constrained.fptoui.v16i8.v16f32(<16 x float> %a, + metadata !"fpexcept.strict") + ret <16 x i8> %ret +} + +define <16 x i1> @strict_vector_fptosi_v16f32_to_v16i1(<16 x float> %a) #0 { +; AVX512VL-LABEL: strict_vector_fptosi_v16f32_to_v16i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2dq %zmm0, %zmm0 +; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptosi_v16f32_to_v16i1: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call <16 x i1> @llvm.experimental.constrained.fptosi.v16i1.v16f32(<16 x float> %a, + metadata !"fpexcept.strict") + ret <16 x i1> %ret +} + +define <16 x i1> @strict_vector_fptoui_v16f32_to_v16i1(<16 x float> %a) #0 { +; AVX512VL-LABEL: strict_vector_fptoui_v16f32_to_v16i1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vcvttps2dq %zmm0, %zmm0 +; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: ret{{[l|q]}} +; +; AVX512DQ-LABEL: strict_vector_fptoui_v16f32_to_v16i1: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vcvttps2dq %zmm0, %zmm0 +; AVX512DQ-NEXT: vpslld $31, %zmm0, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 +; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: ret{{[l|q]}} + %ret = call 
<16 x i1> @llvm.experimental.constrained.fptoui.v16i1.v16f32(<16 x float> %a, + metadata !"fpexcept.strict") + ret <16 x i1> %ret +} + + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index 27bc9c13d4d8e..40ff465a9ddf9 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -3922,29 +3922,12 @@ entry: define <4 x i32> @constrained_vector_fptosi_v4i32_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v4i32_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax -; CHECK-NEXT: movd %eax, %xmm2 -; CHECK-NEXT: cvttss2si {{.*}}(%rip), %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: cvttps2dq {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptosi_v4i32_v4f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttss2si {{.*}}(%rip), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vcvttss2si {{.*}}(%rip), %eax -; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: vcvttss2si {{.*}}(%rip), %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; AVX-NEXT: vcvttss2si {{.*}}(%rip), %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvttps2dq {{.*}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %result = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f32( @@ -4065,20 +4048,27 @@ define <4 x i64> @constrained_vector_fptosi_v4i64_v4f32() #0 { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX512-LABEL: constrained_vector_fptosi_v4i64_v4f32: -; AVX512: # 
%bb.0: # %entry -; AVX512-NEXT: vcvttss2si {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vcvttss2si {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vcvttss2si {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vcvttss2si {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: constrained_vector_fptosi_v4i64_v4f32: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vcvttss2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: constrained_vector_fptosi_v4i64_v4f32: +; AVX512DQ: # %bb.0: # %entry +; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1] +; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq entry: %result = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f32( <4 x float> @constrained_vector_fptosi_v2i32_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v2i32_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: cvttpd2dq {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptosi_v2i32_v2f64: ; 
AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %eax -; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvttpd2dqx {{.*}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %result = call <2 x i32> @llvm.experimental.constrained.fptosi.v2i32.v2f64( @@ -4162,29 +4145,14 @@ entry: define <4 x i32> @constrained_vector_fptosi_v4i32_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fptosi_v4i32_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax -; CHECK-NEXT: movd %eax, %xmm1 -; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax -; CHECK-NEXT: movd %eax, %xmm2 -; CHECK-NEXT: cvttsd2si {{.*}}(%rip), %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: cvttpd2dq {{.*}}(%rip), %xmm1 +; CHECK-NEXT: cvttpd2dq {{.*}}(%rip), %xmm0 +; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptosi_v4i32_v4f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %eax -; AVX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %eax -; AVX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 -; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %eax -; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvttpd2dqy {{.*}}(%rip), %xmm0 ; AVX-NEXT: retq entry: %result = call <4 x i32> @llvm.experimental.constrained.fptosi.v4i32.v4f64( @@ -4221,14 +4189,31 @@ define <2 x i64> @constrained_vector_fptosi_v2i64_v2f64() #0 { ; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; -; AVX-LABEL: constrained_vector_fptosi_v2i64_v2f64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: 
vcvttsd2si {{.*}}(%rip), %rax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: vcvttsd2si {{.*}}(%rip), %rax -; AVX-NEXT: vmovq %rax, %xmm1 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: retq +; AVX1-LABEL: constrained_vector_fptosi_v2i64_v2f64: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX1-NEXT: vmovq %rax, %xmm0 +; AVX1-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX1-NEXT: vmovq %rax, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX1-NEXT: retq +; +; AVX512F-LABEL: constrained_vector_fptosi_v2i64_v2f64: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: constrained_vector_fptosi_v2i64_v2f64: +; AVX512DQ: # %bb.0: # %entry +; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [4.2100000000000001E+1,4.2200000000000003E+1] +; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq entry: %result = call <2 x i64> @llvm.experimental.constrained.fptosi.v2i64.v2f64( <2 x double>, @@ -4305,20 +4290,27 @@ define <4 x i64> @constrained_vector_fptosi_v4i64_v4f64() #0 { ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX512-LABEL: constrained_vector_fptosi_v4i64_v4f64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvttsd2si {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vcvttsd2si {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vcvttsd2si {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vcvttsd2si {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vinserti128 
$1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: constrained_vector_fptosi_v4i64_v4f64: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vcvttsd2si {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: constrained_vector_fptosi_v4i64_v4f64: +; AVX512DQ: # %bb.0: # %entry +; AVX512DQ-NEXT: vmovaps {{.*#+}} ymm0 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1] +; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq entry: %result = call <4 x i64> @llvm.experimental.constrained.fptosi.v4i64.v4f64( <4 x double> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX512-LABEL: constrained_vector_fptoui_v4i64_v4f32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vcvttss2usi {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: constrained_vector_fptoui_v4i64_v4f32: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; 
AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vcvttss2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: constrained_vector_fptoui_v4i64_v4f32: +; AVX512DQ: # %bb.0: # %entry +; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,4.3E+1,4.4E+1,4.5E+1] +; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq entry: %result = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f32( <4 x float> @constrained_vector_fptoui_v2i64_v2f64() #0 { ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-NEXT: retq ; -; AVX512-LABEL: constrained_vector_fptoui_v2i64_v2f64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: retq +; AVX512F-LABEL: constrained_vector_fptoui_v2i64_v2f64: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: constrained_vector_fptoui_v2i64_v2f64: +; AVX512DQ: # %bb.0: # %entry +; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm0 = [4.2100000000000001E+1,4.2200000000000003E+1] +; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq entry: %result = call <2 x i64> 
@llvm.experimental.constrained.fptoui.v2i64.v2f64( <2 x double>, @@ -4981,20 +4988,27 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX512-LABEL: constrained_vector_fptoui_v4i64_v4f64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm1 -; AVX512-NEXT: vcvttsd2usi {{.*}}(%rip), %rax -; AVX512-NEXT: vmovq %rax, %xmm2 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: constrained_vector_fptoui_v4i64_v4f64: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm0 +; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm1 +; AVX512F-NEXT: vcvttsd2usi {{.*}}(%rip), %rax +; AVX512F-NEXT: vmovq %rax, %xmm2 +; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512DQ-LABEL: constrained_vector_fptoui_v4i64_v4f64: +; AVX512DQ: # %bb.0: # %entry +; AVX512DQ-NEXT: vmovaps {{.*#+}} ymm0 = [4.2100000000000001E+1,4.2200000000000003E+1,4.2299999999999997E+1,4.2399999999999999E+1] +; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq entry: %result = call <4 x i64> @llvm.experimental.constrained.fptoui.v4i64.v4f64( <4 x double>