diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 798e6a1d9525e2..de7bf26868b342 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -614,6 +614,12 @@ def strict_sint_to_fp : SDNode<"ISD::STRICT_SINT_TO_FP", SDTIntToFPOp, [SDNPHasChain]>; def strict_uint_to_fp : SDNode<"ISD::STRICT_UINT_TO_FP", SDTIntToFPOp, [SDNPHasChain]>; + +def strict_f16_to_fp : SDNode<"ISD::STRICT_FP16_TO_FP", + SDTIntToFPOp, [SDNPHasChain]>; +def strict_fp_to_f16 : SDNode<"ISD::STRICT_FP_TO_FP16", + SDTFPToIntOp, [SDNPHasChain]>; + def strict_fsetcc : SDNode<"ISD::STRICT_FSETCC", SDTSetCC, [SDNPHasChain]>; def strict_fsetccs : SDNode<"ISD::STRICT_FSETCCS", SDTSetCC, [SDNPHasChain]>; @@ -1558,6 +1564,13 @@ def any_fsetccs : PatFrags<(ops node:$lhs, node:$rhs, node:$pred), [(strict_fsetccs node:$lhs, node:$rhs, node:$pred), (setcc node:$lhs, node:$rhs, node:$pred)]>; +def any_f16_to_fp : PatFrags<(ops node:$src), + [(f16_to_fp node:$src), + (strict_f16_to_fp node:$src)]>; +def any_fp_to_f16 : PatFrags<(ops node:$src), + [(fp_to_f16 node:$src), + (strict_fp_to_f16 node:$src)]>; + multiclass binary_atomic_op_ord { def NAME#_monotonic : PatFrag<(ops node:$ptr, node:$val), (!cast(NAME) node:$ptr, node:$val)> { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 630aa4a07d7b94..3b8fadf41be669 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -522,8 +522,11 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { Op = GetPromotedFloat(Op); // If the promotion did the FP_EXTEND to the destination type for us, // there's nothing left to do here. 
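  // For the strict variant the node also produces a chain result (value #1),
  // e.g. a node of the form:
  //   t3: f32,ch = strict_fp_extend t0:ch, t2
  // That chain result must be remapped to the incoming chain before only the
  // bit-converted value is returned, which is what the IsStrict path below
  // does.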
- if (Op.getValueType() == N->getValueType(0)) + if (Op.getValueType() == N->getValueType(0)) { + if (IsStrict) + ReplaceValueWith(SDValue(N, 1), Chain); return BitConvertToInteger(Op); + } } // There's only a libcall for f16 -> f32 and shifting is only valid for bf16 @@ -541,8 +544,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { } } - if (Op.getValueType() == MVT::bf16) + if (Op.getValueType() == MVT::bf16) { + // FIXME: Need ReplaceValueWith on chain in strict case return SoftenFloatRes_BF16_TO_FP(N); + } RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); @@ -2181,6 +2186,24 @@ static ISD::NodeType GetPromotionOpcode(EVT OpVT, EVT RetVT) { report_fatal_error("Attempt at an invalid promotion-related conversion"); } +static ISD::NodeType GetPromotionOpcodeStrict(EVT OpVT, EVT RetVT) { + if (OpVT == MVT::f16) + return ISD::STRICT_FP16_TO_FP; + + if (RetVT == MVT::f16) + return ISD::STRICT_FP_TO_FP16; + + if (OpVT == MVT::bf16) { + // TODO: return ISD::STRICT_BF16_TO_FP; + } + + if (RetVT == MVT::bf16) { + // TODO: return ISD::STRICT_FP_TO_BF16; + } + + report_fatal_error("Attempt at an invalid promotion-related conversion"); +} + bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) { LLVM_DEBUG(dbgs() << "Promote float operand " << OpNo << ": "; N->dump(&DAG)); SDValue R = SDValue(); @@ -2214,6 +2237,9 @@ bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) { case ISD::FP_TO_UINT_SAT: R = PromoteFloatOp_FP_TO_XINT_SAT(N, OpNo); break; case ISD::FP_EXTEND: R = PromoteFloatOp_FP_EXTEND(N, OpNo); break; + case ISD::STRICT_FP_EXTEND: + R = PromoteFloatOp_STRICT_FP_EXTEND(N, OpNo); + break; case ISD::SELECT_CC: R = PromoteFloatOp_SELECT_CC(N, OpNo); break; case ISD::SETCC: R = PromoteFloatOp_SETCC(N, OpNo); break; case ISD::STORE: R = PromoteFloatOp_STORE(N, OpNo); break; @@ -2276,6 +2302,26 @@ SDValue DAGTypeLegalizer::PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo) { return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Op); } +SDValue DAGTypeLegalizer::PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N, + unsigned OpNo) { + assert(OpNo == 1 && "Promoting unpromotable operand"); + + SDValue Op = GetPromotedFloat(N->getOperand(1)); + EVT VT = N->getValueType(0); + + // Desired VT is same as promoted type. Use promoted float directly. + if (VT == Op->getValueType(0)) { + ReplaceValueWith(SDValue(N, 1), N->getOperand(0)); + return Op; + } + + // Else, extend the promoted float value to the desired VT. + SDValue Res = DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(N), N->getVTList(), + N->getOperand(0), Op); + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + // Promote the float operands used for comparison. 
The true- and false- // operands have the same type as the result and are promoted, if needed, by // PromoteFloatRes_SELECT_CC @@ -2393,12 +2439,16 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::FFREXP: R = PromoteFloatRes_FFREXP(N); break; case ISD::FP_ROUND: R = PromoteFloatRes_FP_ROUND(N); break; + case ISD::STRICT_FP_ROUND: + R = PromoteFloatRes_STRICT_FP_ROUND(N); + break; case ISD::LOAD: R = PromoteFloatRes_LOAD(N); break; case ISD::SELECT: R = PromoteFloatRes_SELECT(N); break; case ISD::SELECT_CC: R = PromoteFloatRes_SELECT_CC(N); break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: R = PromoteFloatRes_XINT_TO_FP(N); break; + case ISD::STRICT_SINT_TO_FP: R = PromoteFloatRes_STRICT_XINT_TO_FP(N); break; case ISD::UNDEF: R = PromoteFloatRes_UNDEF(N); break; case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break; case ISD::VECREDUCE_FADD: @@ -2598,6 +2648,29 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_FP_ROUND(SDNode *N) { return DAG.getNode(GetPromotionOpcode(VT, NVT), DL, NVT, Round); } +// Explicit operation to reduce precision. Reduce the value to half precision +// and promote it back to the legal type. +SDValue DAGTypeLegalizer::PromoteFloatRes_STRICT_FP_ROUND(SDNode *N) { + SDLoc DL(N); + + SDValue Chain = N->getOperand(0); + SDValue Op = N->getOperand(1); + EVT VT = N->getValueType(0); + EVT OpVT = Op->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); + + // Round promoted float to desired precision + SDValue Round = DAG.getNode(GetPromotionOpcodeStrict(OpVT, VT), DL, + DAG.getVTList(IVT, MVT::Other), Chain, Op); + // Promote it back to the legal output type + SDValue Res = + DAG.getNode(GetPromotionOpcodeStrict(VT, NVT), DL, + DAG.getVTList(NVT, MVT::Other), Round.getValue(1), Round); + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) { LoadSDNode *L = cast(N); EVT VT = N->getValueType(0); @@ -2651,6 +2724,23 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_XINT_TO_FP(SDNode *N) { DAG.getIntPtrConstant(0, DL, /*isTarget=*/true))); } +// Construct a SDNode that transforms the SINT or UINT operand to the promoted +// float type. +SDValue DAGTypeLegalizer::PromoteFloatRes_STRICT_XINT_TO_FP(SDNode *N) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + SDVTList NVTs = DAG.getVTList(NVT, MVT::Other); + + SDValue NV = DAG.getNode(N->getOpcode(), DL, NVTs, N->getOperand(0), N->getOperand(1)); + + // Round the value to the desired precision (that of the source type). 
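  // The chain is threaded through each node (convert -> round -> extend):
  // every strict node consumes the chain result of the one before it as its
  // first operand, so the constrained operations, and any FP exceptions they
  // raise, keep their original ordering.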
+ SDValue Rounded = DAG.getNode(ISD::STRICT_FP_ROUND, DL, N->getVTList(), NV.getValue(1), NV, + DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)); + return DAG.getNode( + ISD::STRICT_FP_EXTEND, DL, NVTs, Rounded.getValue(1), Rounded.getValue(0)); +} + SDValue DAGTypeLegalizer::PromoteFloatRes_UNDEF(SDNode *N) { return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0))); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 54698edce7d6f8..887670fb6baff3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -165,7 +165,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::FP_TO_FP16: Res = PromoteIntRes_FP_TO_FP16_BF16(N); break; - + case ISD::STRICT_FP_TO_FP16: + Res = PromoteIntRes_STRICT_FP_TO_FP16_BF16(N); + break; case ISD::GET_ROUNDING: Res = PromoteIntRes_GET_ROUNDING(N); break; case ISD::AND: @@ -787,6 +789,16 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_FP16_BF16(SDNode *N) { return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0)); } +SDValue DAGTypeLegalizer::PromoteIntRes_STRICT_FP_TO_FP16_BF16(SDNode *N) { + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + SDLoc dl(N); + + SDValue Res = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(NVT, MVT::Other), + N->getOperand(0), N->getOperand(1)); + ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); + return Res; +} + SDValue DAGTypeLegalizer::PromoteIntRes_XRINT(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); @@ -1804,6 +1816,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { case ISD::FP16_TO_FP: case ISD::VP_UINT_TO_FP: case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break; + case ISD::STRICT_FP16_TO_FP: case ISD::STRICT_UINT_TO_FP: Res = PromoteIntOp_STRICT_UINT_TO_FP(N); break; case ISD::ZERO_EXTEND: Res = PromoteIntOp_ZERO_EXTEND(N); break; case ISD::VP_ZERO_EXTEND: Res = PromoteIntOp_VP_ZERO_EXTEND(N); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index e9bd54089d0627..26c92c9e927bbe 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -326,6 +326,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_FP_TO_XINT(SDNode *N); SDValue PromoteIntRes_FP_TO_XINT_SAT(SDNode *N); SDValue PromoteIntRes_FP_TO_FP16_BF16(SDNode *N); + SDValue PromoteIntRes_STRICT_FP_TO_FP16_BF16(SDNode *N); SDValue PromoteIntRes_XRINT(SDNode *N); SDValue PromoteIntRes_FREEZE(SDNode *N); SDValue PromoteIntRes_INT_EXTEND(SDNode *N); @@ -698,6 +699,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteFloatRes_ExpOp(SDNode *N); SDValue PromoteFloatRes_FFREXP(SDNode *N); SDValue PromoteFloatRes_FP_ROUND(SDNode *N); + SDValue PromoteFloatRes_STRICT_FP_ROUND(SDNode *N); SDValue PromoteFloatRes_LOAD(SDNode *N); SDValue PromoteFloatRes_SELECT(SDNode *N); SDValue PromoteFloatRes_SELECT_CC(SDNode *N); @@ -705,6 +707,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteFloatRes_UNDEF(SDNode *N); SDValue BitcastToInt_ATOMIC_SWAP(SDNode *N); SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N); + SDValue PromoteFloatRes_STRICT_XINT_TO_FP(SDNode *N); SDValue PromoteFloatRes_VECREDUCE(SDNode *N); SDValue PromoteFloatRes_VECREDUCE_SEQ(SDNode *N); @@ -712,6 +715,7 @@ class 
LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo); + SDValue PromoteFloatOp_STRICT_FP_EXTEND(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_UnaryOp(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_FP_TO_XINT_SAT(SDNode *N, unsigned OpNo); SDValue PromoteFloatOp_STORE(SDNode *N, unsigned OpNo); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index da7d9ace4114a6..29684d3372bdba 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -153,6 +153,7 @@ static const unsigned MaxParallelChains = 64; static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V, + SDValue InChain, std::optional CC); /// getCopyFromParts - Create a value that contains the specified legal parts @@ -163,6 +164,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V, + SDValue InChain, std::optional CC = std::nullopt, std::optional AssertOp = std::nullopt) { // Let the target assemble the parts if it wants to @@ -173,7 +175,7 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, if (ValueVT.isVector()) return getCopyFromPartsVector(DAG, DL, Parts, NumParts, PartVT, ValueVT, V, - CC); + InChain, CC); assert(NumParts > 0 && "No parts to assemble!"); SDValue Val = Parts[0]; @@ -194,10 +196,10 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), RoundBits/2); if (RoundParts > 2) { - Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2, - PartVT, HalfVT, V); - Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2, - RoundParts / 2, PartVT, HalfVT, V); + Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2, PartVT, HalfVT, V, + InChain); + Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2, RoundParts / 2, + PartVT, HalfVT, V, InChain); } else { Lo = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[0]); Hi = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[1]); @@ -213,7 +215,7 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned OddParts = NumParts - RoundParts; EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits); Hi = getCopyFromParts(DAG, DL, Parts + RoundParts, OddParts, PartVT, - OddVT, V, CC); + OddVT, V, InChain, CC); // Combine the round and odd parts. Lo = Val; @@ -243,7 +245,8 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, assert(ValueVT.isFloatingPoint() && PartVT.isInteger() && !PartVT.isVector() && "Unexpected split"); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits()); - Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V, CC); + Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V, + InChain, CC); } } @@ -283,10 +286,20 @@ getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, if (PartEVT.isFloatingPoint() && ValueVT.isFloatingPoint()) { // FP_ROUND's are always exact here. 
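      // In a strictfp function a plain FP_ROUND here could be reordered or
      // folded away, so STRICT_FP_ROUND is emitted instead, chained to the
      // incoming chain (InChain); the constant 1 ("NoChange") still records
      // that the rounding is exact. This path can be reached when a narrow FP
      // value (e.g. half) is reassembled from parts held in a wider FP type.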
- if (ValueVT.bitsLT(Val.getValueType())) - return DAG.getNode( - ISD::FP_ROUND, DL, ValueVT, Val, - DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout()))); + if (ValueVT.bitsLT(Val.getValueType())) { + + SDValue NoChange = + DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())); + + if (DAG.getMachineFunction().getFunction().getAttributes().hasFnAttr( + llvm::Attribute::StrictFP)) { + return DAG.getNode(ISD::STRICT_FP_ROUND, DL, + DAG.getVTList(ValueVT, MVT::Other), InChain, Val, + NoChange); + } + + return DAG.getNode(ISD::FP_ROUND, DL, ValueVT, Val, NoChange); + } return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val); } @@ -324,6 +337,7 @@ static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V, static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, MVT PartVT, EVT ValueVT, const Value *V, + SDValue InChain, std::optional CallConv) { assert(ValueVT.isVector() && "Not a vector value"); assert(NumParts > 0 && "No parts to assemble!"); @@ -362,8 +376,8 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, // If the register was not expanded, truncate or copy the value, // as appropriate. for (unsigned i = 0; i != NumParts; ++i) - Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1, - PartVT, IntermediateVT, V, CallConv); + Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1, PartVT, IntermediateVT, + V, InChain, CallConv); } else if (NumParts > 0) { // If the intermediate type was expanded, build the intermediate // operands from the parts. @@ -371,8 +385,8 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, "Must expand into a divisible number of parts!"); unsigned Factor = NumParts / NumIntermediates; for (unsigned i = 0; i != NumIntermediates; ++i) - Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor, - PartVT, IntermediateVT, V, CallConv); + Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor, PartVT, + IntermediateVT, V, InChain, CallConv); } // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the @@ -926,7 +940,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, } Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(), NumRegs, - RegisterVT, ValueVT, V, CallConv); + RegisterVT, ValueVT, V, Chain, CallConv); Part += NumRegs; Parts.clear(); } @@ -10641,9 +10655,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { unsigned NumRegs = getNumRegistersForCallingConv(CLI.RetTy->getContext(), CLI.CallConv, VT); - ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg], - NumRegs, RegisterVT, VT, nullptr, - CLI.CallConv, AssertOp)); + ReturnValues.push_back(getCopyFromParts( + CLI.DAG, CLI.DL, &InVals[CurReg], NumRegs, RegisterVT, VT, nullptr, + CLI.Chain, CLI.CallConv, AssertOp)); CurReg += NumRegs; } @@ -11122,8 +11136,9 @@ void SelectionDAGISel::LowerArguments(const Function &F) { MVT VT = ValueVTs[0].getSimpleVT(); MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT); std::optional AssertOp; - SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1, RegVT, VT, - nullptr, F.getCallingConv(), AssertOp); + SDValue ArgValue = + getCopyFromParts(DAG, dl, &InVals[0], 1, RegVT, VT, nullptr, NewRoot, + F.getCallingConv(), AssertOp); MachineFunction& MF = SDB->DAG.getMachineFunction(); MachineRegisterInfo& RegInfo = MF.getRegInfo(); @@ -11195,7 +11210,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) { AssertOp = ISD::AssertZext; 
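        // NewRoot is the chain produced by lowering the formal arguments; it
        // is now passed through as the incoming chain so that any
        // STRICT_FP_ROUND built while reassembling an argument value is
        // ordered after the argument copies.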
ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts, - PartVT, VT, nullptr, + PartVT, VT, nullptr, NewRoot, F.getCallingConv(), AssertOp)); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index fcbdf51b03c1fc..7d33421e11fbbb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -421,7 +421,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand); setOperationAction( - {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, + {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::STRICT_SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Expand); @@ -442,7 +442,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, ISD::FP_TO_UINT, ISD::MUL, ISD::MULHU, ISD::MULHS, ISD::OR, ISD::SHL, ISD::SRA, ISD::SRL, ISD::ROTL, - ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP, + ISD::ROTR, ISD::SUB, ISD::SINT_TO_FP, ISD::STRICT_SINT_TO_FP, ISD::UINT_TO_FP, ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::SMUL_LOHI, ISD::UMUL_LOHI, ISD::SDIVREM, ISD::UDIVREM, @@ -1331,7 +1331,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, return lowerFEXP(Op, DAG); case ISD::FEXP2: return lowerFEXP2(Op, DAG); - case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); + case ISD::SINT_TO_FP: + case ISD::STRICT_SINT_TO_FP: + return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); case ISD::FP_TO_SINT: @@ -3097,6 +3099,8 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const { + bool IsStrict = Op->getNumValues() == 2; + // The regular method converting a 64-bit integer to float roughly consists of // 2 steps: normalization and rounding. In fact, after normalization, the // conversion from a 64-bit integer to a float is essentially the same as the @@ -3124,7 +3128,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, // converted instead followed by negation based its sign bit. SDLoc SL(Op); - SDValue Src = Op.getOperand(0); + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); SDValue Lo, Hi; std::tie(Lo, Hi) = split64BitValue(Src, DAG); @@ -3200,8 +3204,14 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32), ShAmt); // On GCN, use LDEXP directly. - if (Subtarget->isGCN()) + if (Subtarget->isGCN()) { + if (IsStrict) { + return DAG.getNode(ISD::STRICT_FLDEXP, SL, Op->getVTList(), + Op.getOperand(0), FVal, ShAmt); + } + return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt); + } // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent // part directly to emulate the multiplication of 2^ShAmt. That 8-bit @@ -3224,11 +3234,27 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const { SDLoc SL(Op); - SDValue Src = Op.getOperand(0); + bool IsStrict = Op->getNumValues() == 2; + SDValue Src = Op.getOperand(IsStrict ? 
1 : 0); SDValue Lo, Hi; std::tie(Lo, Hi) = split64BitValue(Src, DAG); + if (IsStrict) { + SDVTList VTs = Op->getVTList(); + SDValue CvtHi = DAG.getNode(Signed ? ISD::STRICT_SINT_TO_FP : ISD::STRICT_UINT_TO_FP, + SL, VTs, Op.getOperand(0), Hi); + + SDValue CvtLo = DAG.getNode(ISD::STRICT_UINT_TO_FP, SL, VTs, CvtHi.getValue(1), Lo); + + SDValue LdExp = DAG.getNode(ISD::STRICT_FLDEXP, SL, VTs, + CvtLo.getValue(1), + CvtHi, + DAG.getConstant(32, SL, MVT::i32)); + // TODO: Should this propagate fast-math-flags? + return DAG.getNode(ISD::STRICT_FADD, SL, VTs, LdExp.getValue(1), LdExp, CvtLo); + } + SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP, SL, MVT::f64, Hi); @@ -3262,7 +3288,8 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { SDLoc DL(Op); - SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); + SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, {MVT::f32, MVT::Other}, + Src); SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true); SDValue FPRound = @@ -3281,8 +3308,8 @@ SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op, SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { EVT DestVT = Op.getValueType(); - - SDValue Src = Op.getOperand(0); + bool IsStrict = Op.getOpcode() == ISD::STRICT_SINT_TO_FP; + SDValue Src = Op.getOperand(IsStrict ? 1 : 0); EVT SrcVT = Src.getValueType(); if (SrcVT == MVT::i16) { @@ -3292,6 +3319,8 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, SDLoc DL(Op); // Promote src to i32 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src); + if (IsStrict) + return DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, Op->getVTList(), Op.getOperand(0), Ext); return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext); } @@ -3301,13 +3330,23 @@ SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op, if (Subtarget->has16BitInsts() && DestVT == MVT::f16) { SDLoc DL(Op); - SDValue Src = Op.getOperand(0); + SDValue Src = Op.getOperand(IsStrict ? 
1 : 0); - SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); SDValue FPRoundFlag = - DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true); + DAG.getIntPtrConstant(0, DL, /*isTarget=*/true); + + if (IsStrict) { + SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, DAG.getVTList(MVT::f32, MVT::Other), + Op.getOperand(0), Src); + SDValue FPRound = + DAG.getNode(ISD::STRICT_FP_ROUND, DL, Op->getVTList(), IntToFp32.getValue(1), + IntToFp32, FPRoundFlag); + return FPRound; + } + + SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src); SDValue FPRound = - DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); + DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag); return FPRound; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 53ab5da0135399..db329e3cbe3a3f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -539,12 +539,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND}, MVT::f16, Custom); - setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom); + setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP}, MVT::i16, Custom); setOperationAction( - {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP}, + {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::STRICT_SINT_TO_FP, + ISD::UINT_TO_FP}, MVT::f16, Promote); + setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Legal); + // F16 - VOP2 Actions. setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand); setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 9362fe5d9678b4..581c23cede7d49 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1097,27 +1097,27 @@ def : Pat < multiclass f16_fp_Pats { // f16_to_fp patterns def : GCNPat < - (f32 (f16_to_fp i32:$src0)), + (f32 (any_f16_to_fp i32:$src0)), (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src0) >; def : GCNPat < - (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))), + (f32 (any_f16_to_fp (and_oneuse i32:$src0, 0x7fff))), (cvt_f32_f16_inst_e64 SRCMODS.ABS, $src0) >; def : GCNPat < - (f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))), + (f32 (any_f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))), (cvt_f32_f16_inst_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0))) >; def : GCNPat < - (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))), + (f32 (any_f16_to_fp (or_oneuse i32:$src0, 0x8000))), (cvt_f32_f16_inst_e64 SRCMODS.NEG_ABS, $src0) >; def : GCNPat < - (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))), + (f32 (any_f16_to_fp (xor_oneuse i32:$src0, 0x8000))), (cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0) >; @@ -1143,7 +1143,7 @@ multiclass f16_fp_Pats; def : GCNPat < - (f16 (sint_to_fp i32:$src)), + (f16 (any_sint_to_fp i32:$src)), (cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_I32_e32 VSrc_b32:$src)) >; @@ -1151,6 +1151,13 @@ multiclass f16_fp_Pats; + + // This is only used on targets without half support + // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering + def : GCNPat < + (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))), + (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0) + >; } let SubtargetPredicate = NotHasTrue16BitInsts in 
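// The any_* PatFrags (any_f16_to_fp, any_fp_to_f16, any_sint_to_fp) let a
// single GCNPat cover both the ordinary node and its constrained,
// chain-carrying counterpart (e.g. f16_to_fp and strict_f16_to_fp), so the
// f16 conversion patterns above do not need to be duplicated for the strict
// lowering.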
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 9e10efd1b07e19..763978178e6d66 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -413,7 +413,7 @@ class SOP1_F32_Inst; + def S_CVT_F32_I32 : SOP1_F32_Inst<"s_cvt_f32_i32", sint_to_fp, f32, i32>; // xxx - any def S_CVT_F32_U32 : SOP1_F32_Inst<"s_cvt_f32_u32", uint_to_fp, f32, i32>; let mayRaiseFPException = 1 in { diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 53b0513c85d886..72d8f97991729e 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -262,7 +262,7 @@ let SchedRW = [WriteDoubleCvt] in { defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_sint>; let mayRaiseFPException = 0 in { -defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>; +defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, any_sint_to_fp>; } defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>; @@ -281,7 +281,7 @@ let SchedRW = [WriteFloatCvt] in { // XXX: Does this really not raise exceptions? The manual claims the // 16-bit ones can. let mayRaiseFPException = 0 in { -defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>; +defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, any_sint_to_fp>; defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>; } @@ -467,11 +467,11 @@ let SubtargetPredicate = isGFX7Plus in { let FPDPRounding = 1 in { let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in { defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>; -defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>; +defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, any_sint_to_fp>; } let OtherPredicates = [HasTrue16BitInsts] in { defm V_CVT_F16_U16_t16 : VOP1Inst <"v_cvt_f16_u16_t16", VOP1_F16_I16_t16, uint_to_fp>; -defm V_CVT_F16_I16_t16 : VOP1Inst <"v_cvt_f16_i16_t16", VOP1_F16_I16_t16, sint_to_fp>; +defm V_CVT_F16_I16_t16 : VOP1Inst <"v_cvt_f16_i16_t16", VOP1_F16_I16_t16, any_sint_to_fp>; } } // End FPDPRounding = 1 // OMod clears exceptions when set in these two instructions @@ -509,7 +509,7 @@ defm V_FRACT_F16 : VOP1Inst_t16 <"v_fract_f16", VOP_F16_F16, AMDGPUfract>; let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in { def : GCNPat< - (f32 (f16_to_fp i16:$src)), + (f32 (any_f16_to_fp i16:$src)), (V_CVT_F32_F16_e32 $src) >; def : GCNPat< @@ -519,7 +519,7 @@ def : GCNPat< } let OtherPredicates = [HasTrue16BitInsts] in { def : GCNPat< - (f32 (f16_to_fp i16:$src)), + (f32 (any_f16_to_fp i16:$src)), (V_CVT_F32_F16_t16_e32 $src) >; def : GCNPat< diff --git a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll new file mode 100644 index 00000000000000..37186cf22ccc71 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll @@ -0,0 +1,344 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 < %s | FileCheck -check-prefix=NOFP16 %s + +declare void @f16_user(half) +declare half @f16_result() + +declare void @v2f16_user(<2 x half>) +declare <2 x half> @v2f16_result() + +declare void @v4f16_user(<4 x half>) +declare <4 x half> @v4f16_result() + +declare void @v8f16_user(<8 x half>) +declare <8 x 
half> @v8f16_result() + +define void @f16_arg(half %arg, ptr %ptr) #0 { +; NOFP16-LABEL: f16_arg: +; NOFP16: // %bb.0: +; NOFP16-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 16 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w30, -16 +; NOFP16-NEXT: and w0, w0, #0xffff +; NOFP16-NEXT: mov x19, x1 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: str w0, [x19] +; NOFP16-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; NOFP16-NEXT: ret + %fpext = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict") + store float %fpext, ptr %ptr + ret void +} + +define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 { +; NOFP16-LABEL: v2f16_arg: +; NOFP16: // %bb.0: +; NOFP16-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 32 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w21, -24 +; NOFP16-NEXT: .cfi_offset w30, -32 +; NOFP16-NEXT: and w0, w0, #0xffff +; NOFP16-NEXT: mov x19, x2 +; NOFP16-NEXT: mov w20, w1 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: and w0, w20, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: stp w21, w0, [x19] +; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload +; NOFP16-NEXT: ret + %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict") + store <2 x float> %fpext, ptr %ptr + ret void +} + +define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 { +; NOFP16-LABEL: v3f16_arg: +; NOFP16: // %bb.0: +; NOFP16-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill +; NOFP16-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 48 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w21, -24 +; NOFP16-NEXT: .cfi_offset w22, -32 +; NOFP16-NEXT: .cfi_offset w30, -48 +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: and w0, w2, #0xffff +; NOFP16-NEXT: mov x19, x3 +; NOFP16-NEXT: mov w20, w1 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w22, w0 +; NOFP16-NEXT: and w0, w21, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: and w0, w20, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w8, w21 +; NOFP16-NEXT: // kill: def $w0 killed $w0 def $x0 +; NOFP16-NEXT: str w22, [x19, #8] +; NOFP16-NEXT: orr x8, x8, x0, lsl #32 +; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: str x8, [x19] +; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; NOFP16-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload +; NOFP16-NEXT: ret + %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict") + store <3 x float> %fpext, ptr %ptr + ret void +} + +define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { +; NOFP16-LABEL: v4f16_arg: +; NOFP16: // %bb.0: +; NOFP16-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill +; NOFP16-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 48 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w21, -24 +; NOFP16-NEXT: .cfi_offset w22, -32 +; NOFP16-NEXT: .cfi_offset w23, -40 +; NOFP16-NEXT: .cfi_offset w30, -48 +; NOFP16-NEXT: and w0, w0, #0xffff +; NOFP16-NEXT: mov x19, x4 +; NOFP16-NEXT: mov w20, w3 +; NOFP16-NEXT: mov w21, w2 +; NOFP16-NEXT: mov w22, w1 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w23, w0 +; NOFP16-NEXT: and w0, w22, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w22, w0 +; NOFP16-NEXT: and w0, w21, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: and w0, w20, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: stp w21, w0, [x19, #8] +; NOFP16-NEXT: stp w23, w22, [x19] +; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; NOFP16-NEXT: ret + %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict") + store <4 x float> %fpext, ptr %ptr + ret void +} + +; FIXME: +; define half @f16_return(float %arg) #0 { +; %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") +; ret half %fptrunc +; } + +; define <2 x half> @v2f16_return(<2 x float> %arg) #0 { +; %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") +; ret <2 x half> %fptrunc +; } + +; define <3 x half> @v3f16_return(<3 x float> %arg) #0 { +; %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") +; ret <3 x half> %fptrunc +; } + +; define <4 x half> @v4f16_return(<4 x float> %arg) #0 { +; %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") +; ret <4 x half> %fptrunc +; } + +; FIXME: +; define void @outgoing_f16_arg(ptr %ptr) #0 { +; %val = load half, ptr %ptr +; call void @f16_user(half %val) +; ret void +; } + +; define void @outgoing_v2f16_arg(ptr %ptr) #0 { +; %val = load <2 x half>, ptr %ptr +; call void @v2f16_user(<2 x half> %val) +; ret void +; } + +; define void @outgoing_f16_return(ptr %ptr) #0 { +; %val = call half @f16_result() +; store half %val, ptr %ptr +; ret void +; } + +; define void @outgoing_v2f16_return(ptr %ptr) #0 { +; %val = call <2 x half> @v2f16_result() +; store <2 x half> %val, ptr %ptr +; ret void +; } + +define void @outgoing_v4f16_return(ptr %ptr) #0 { +; NOFP16-LABEL: outgoing_v4f16_return: +; NOFP16: // %bb.0: +; NOFP16-NEXT: stp x30, x23, [sp, #-48]! 
// 16-byte Folded Spill +; NOFP16-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 48 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w21, -24 +; NOFP16-NEXT: .cfi_offset w22, -32 +; NOFP16-NEXT: .cfi_offset w23, -40 +; NOFP16-NEXT: .cfi_offset w30, -48 +; NOFP16-NEXT: mov x19, x0 +; NOFP16-NEXT: bl v4f16_result +; NOFP16-NEXT: and w0, w0, #0xffff +; NOFP16-NEXT: mov w20, w1 +; NOFP16-NEXT: mov w21, w2 +; NOFP16-NEXT: mov w22, w3 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w23, w0 +; NOFP16-NEXT: and w0, w20, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w20, w0 +; NOFP16-NEXT: and w0, w21, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: and w0, w22, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #6] +; NOFP16-NEXT: mov w0, w21 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #4] +; NOFP16-NEXT: mov w0, w20 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #2] +; NOFP16-NEXT: mov w0, w23 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19] +; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; NOFP16-NEXT: ret + %val = call <4 x half> @v4f16_result() + store <4 x half> %val, ptr %ptr + ret void +} + +define void @outgoing_v8f16_return(ptr %ptr) #0 { +; NOFP16-LABEL: outgoing_v8f16_return: +; NOFP16: // %bb.0: +; NOFP16-NEXT: stp x30, x27, [sp, #-80]! // 16-byte Folded Spill +; NOFP16-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill +; NOFP16-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill +; NOFP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill +; NOFP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 80 +; NOFP16-NEXT: .cfi_offset w19, -8 +; NOFP16-NEXT: .cfi_offset w20, -16 +; NOFP16-NEXT: .cfi_offset w21, -24 +; NOFP16-NEXT: .cfi_offset w22, -32 +; NOFP16-NEXT: .cfi_offset w23, -40 +; NOFP16-NEXT: .cfi_offset w24, -48 +; NOFP16-NEXT: .cfi_offset w25, -56 +; NOFP16-NEXT: .cfi_offset w26, -64 +; NOFP16-NEXT: .cfi_offset w27, -72 +; NOFP16-NEXT: .cfi_offset w30, -80 +; NOFP16-NEXT: mov x19, x0 +; NOFP16-NEXT: bl v8f16_result +; NOFP16-NEXT: and w0, w0, #0xffff +; NOFP16-NEXT: mov w21, w1 +; NOFP16-NEXT: mov w22, w2 +; NOFP16-NEXT: mov w23, w3 +; NOFP16-NEXT: mov w24, w4 +; NOFP16-NEXT: mov w25, w5 +; NOFP16-NEXT: mov w26, w6 +; NOFP16-NEXT: mov w27, w7 +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w20, w0 +; NOFP16-NEXT: and w0, w21, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w21, w0 +; NOFP16-NEXT: and w0, w22, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w22, w0 +; NOFP16-NEXT: and w0, w23, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w23, w0 +; NOFP16-NEXT: and w0, w24, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w24, w0 +; NOFP16-NEXT: and w0, w25, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w25, w0 +; NOFP16-NEXT: and w0, w26, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: mov w26, w0 +; NOFP16-NEXT: and w0, w27, #0xffff +; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #14] +; NOFP16-NEXT: mov w0, w26 +; 
NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #12] +; NOFP16-NEXT: mov w0, w25 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #10] +; NOFP16-NEXT: mov w0, w24 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #8] +; NOFP16-NEXT: mov w0, w23 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #6] +; NOFP16-NEXT: mov w0, w22 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #4] +; NOFP16-NEXT: mov w0, w21 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19, #2] +; NOFP16-NEXT: mov w0, w20 +; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: strh w0, [x19] +; NOFP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload +; NOFP16-NEXT: ldp x30, x27, [sp], #80 // 16-byte Folded Reload +; NOFP16-NEXT: ret + %val = call <8 x half> @v8f16_result() + store <8 x half> %val, ptr %ptr + ret void +} + +define half @call_split_type_used_outside_block_v8f16() #0 { +; NOFP16-LABEL: call_split_type_used_outside_block_v8f16: +; NOFP16: // %bb.0: // %bb0 +; NOFP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; NOFP16-NEXT: .cfi_def_cfa_offset 16 +; NOFP16-NEXT: .cfi_offset w30, -16 +; NOFP16-NEXT: bl v8f16_result +; NOFP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; NOFP16-NEXT: ret +bb0: + %split.ret.type = call <8 x half> @v8f16_result() + br label %bb1 + +bb1: + %extract = extractelement <8 x half> %split.ret.type, i32 0 + ret half %extract +} + +declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0 +declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0 +declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, metadata) #0 +declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) #0 + +declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0 +declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0 +declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #0 +declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float>, metadata, metadata) #0 + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index d74948a460c98b..20d175ecfd9096 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -1316,6 +1316,9 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind { ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7SELDAG-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll index 22bebb7ad26f53..fe59a8491c91ab 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll @@ -1,22 +1,47 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; FIXME: Missing operand promote for f16 -; XUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX8 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s define float @v_constrained_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 { -; GCN-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict") ret float %result } define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict(<2 x half> %arg) #0 { +; SI-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; GFX89-LABEL: v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -45,6 +70,20 @@ define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_fpexcept_strict(<2 x half } define <3 x float> @v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict(<3 x half> %arg) #0 { +; SI-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; GFX89-LABEL: v_constrained_fpext_v3f16_to_v3f32_fpexcept_strict: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -101,6 +140,16 @@ define <2 x double> @v_constrained_fpext_v2f32_to_v2f64_fpexcept_strict(<2 x flo } define <3 x double> @v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict(<3 x float> %arg) #0 { +; SI-LABEL: 
v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v4, v2 +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; GFX89-LABEL: v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -125,17 +174,46 @@ define <3 x double> @v_constrained_fpext_v3f32_to_v3f64_fpexcept_strict(<3 x flo } define double @v_constrained_fpext_f16_to_f64_fpexcept_strict(half %arg) #0 { -; GCN-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1011-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %result = call double @llvm.experimental.constrained.fpext.f64.f16(half %arg, metadata !"fpexcept.strict") ret double %result } define <2 x double> @v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict(<2 x half> %arg) #0 { +; SI-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v1 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; GFX89-LABEL: v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -168,6 +246,23 @@ define <2 x double> @v_constrained_fpext_v2f16_to_v2f64_fpexcept_strict(<2 x hal } define <3 x double> @v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict(<3 x half> %arg) #0 { +; SI-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v3, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v4, v2 +; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v3 +; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; GFX89-LABEL: v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict: ; GFX89: ; %bb.0: ; GFX89-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) @@ -206,23 +301,54 @@ define <3 x double> @v_constrained_fpext_v3f16_to_v2f64_fpexcept_strict(<3 x hal } define float @v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 { -; GCN-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX89-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX1011-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict") %neg.result = fneg float %result ret float %neg.result } define float @v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict(half %arg) #0 { -; GCN-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %neg.arg = fneg half %arg %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %neg.arg, metadata !"fpexcept.strict") ret float %result @@ -251,6 +377,111 @@ define double @v_constrained_fneg_fpext_f32_to_f64_fpexcept_strict(float %arg) # ret double %neg.result } +define float @v_constrained_fpext_f16_to_f32_noabi(ptr addrspace(1) %ptr) #0 { +; SI-LABEL: v_constrained_fpext_f16_to_f32_noabi: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_constrained_fpext_f16_to_f32_noabi: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) 
+; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_constrained_fpext_f16_to_f32_noabi: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constrained_fpext_f16_to_f32_noabi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constrained_fpext_f16_to_f32_noabi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_u16 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val = load half, ptr addrspace(1) %ptr + %result = call float @llvm.experimental.constrained.fpext.f32.f16(half %val, metadata !"fpexcept.strict") + ret float %result +} + +define <2 x float> @v_constrained_fpext_v2f16_to_v2f32_noabi(ptr addrspace(1) %ptr) #0 { +; SI-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: buffer_load_dword v1, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cvt_f32_f16_e32 v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX8-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX9-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX10-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constrained_fpext_v2f16_to_v2f32_noabi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v0, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] + %val = load <2 x half>, ptr addrspace(1) %ptr + %result = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %val, metadata !"fpexcept.strict") + ret <2 x float> %result +} + declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata) #1 declare <2 x double> @llvm.experimental.constrained.fpext.v2f64.v2f32(<2 x 
float>, metadata) #1 declare <3 x double> @llvm.experimental.constrained.fpext.v3f64.v3f32(<3 x float>, metadata) #1 @@ -265,6 +496,3 @@ declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, attributes #0 = { strictfp } attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX8: {{.*}} -; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll index ec2bc43ca36787..965040d0d879c8 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll @@ -1,21 +1,46 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; XUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX8 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11 %s define half @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 { -; GCN-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } define <2 x half> @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x float> %arg) #0 { +; SI-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; GFX8-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -45,6 +70,20 @@ define <2 x half> @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict(<2 x flo } define <3 x half> @v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict(<3 x float> %arg) #0 { +; SI-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict: +; SI: ; 
%bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; ; GFX8-LABEL: v_constrained_fptrunc_v3f32_to_v3f16_fpexcept_strict: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -126,23 +165,53 @@ define <3 x float> @v_constrained_fptrunc_v3f64_to_v3f32_fpexcept_strict(<3 x do ; } define half @v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 { -; GCN-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GCN-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX89-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX1011-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") %neg.val = fneg half %val ret half %neg.val } define half @v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict(float %arg) #0 { -; GCN-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cvt_f16_f32_e64 v0, -v0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; SI-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] %neg.arg = fneg float %arg %val = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val @@ -171,6 +240,145 @@ define float @v_constrained_fptrunc_fneg_f64_to_f32_fpexcept_strict(double %arg) ret float %val } +define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi(float %arg, ptr addrspace(1) %ptr) #0 { +; SI-LABEL: 
v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret void +} + +define void @v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi(<2 x float> %arg, ptr addrspace(1) %ptr) #0 { +; SI-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: s_mov_b32 s5, s6 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_dword v0, v[2:3], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX8-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: flat_store_dword v[2:3], v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX9-NEXT: s_mov_b32 s4, 0x5040100 +; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX10-NEXT: global_store_dword v[2:3], v0, off +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: v_constrained_fptrunc_v2f32_to_v2f16_fpexcept_strict_noabi: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NEXT: global_store_b32 v[2:3], v0, off +; GFX11-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + store <2 x half> %result, ptr 
addrspace(1) %ptr + ret void +} + +define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg(float %arg, ptr addrspace(1) %ptr) #0 { +; SI-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fneg: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f16_f32_e64 v0, -v0 +; GFX1011-NEXT: s_setpc_b64 s[30:31] + %neg.arg = fneg float %arg + %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %neg.arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret void +} + +define void @v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs(float %arg, ptr addrspace(1) %ptr) #0 { +; SI-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; GFX1011-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict_noabi_fabs: +; GFX1011: ; %bb.0: +; GFX1011-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1011-NEXT: v_cvt_f16_f32_e64 v0, |v0| +; GFX1011-NEXT: s_setpc_b64 s[30:31] + %abs.arg = call float @llvm.fabs.f32(float %arg) + %result = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %abs.arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret void +} + declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #1 declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #1 declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #1 @@ -183,9 +391,7 @@ declare half @llvm.experimental.constrained.fptrunc.f16.f64(double, metadata, me declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f64(<2 x double>, metadata, metadata) #1 declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f64(<3 x double>, metadata, metadata) #1 +declare float @llvm.fabs.f32(float) #1 + attributes #0 = { strictfp } attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX10: {{.*}} -; GFX11: {{.*}} -; GFX89: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll new file mode 100644 index 00000000000000..a4138f87158519 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/strict_sitofp.ll @@ -0,0 +1,263 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; FIXME: Missing operand promote for f16 +; XUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX8 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX89,GFX9 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX1100 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1150 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX1011,GFX11,GFX1150 %s + +define half @v_constrained_sitofp_i16_to_f16_fpexcept_strict(i16 %arg) #0 { + %result = call half @llvm.experimental.constrained.sitofp.f16.i16(i16 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret half %result +} + +define <2 x half> @v_constrained_sitofp_v2i16_to_v2f16_fpexcept_strict(<2 x i16> %arg) #0 { + %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i16(<2 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x half> %result +} + +define <3 x half> @v_constrained_sitofp_v3i16_to_v3f16_fpexcept_strict(<3 x i16> %arg) #0 { + %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i16(<3 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x half> %result +} + +define float @v_constrained_sitofp_i16_to_f32_fpexcept_strict(i16 %arg) #0 { + %result = call float @llvm.experimental.constrained.sitofp.f32.i16(i16 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret float %result +} + +define <2 x float> @v_constrained_sitofp_v2i16_to_v2f32_fpexcept_strict(<2 x i16> %arg) #0 { + %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i16(<2 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x float> %result +} + +define <3 x float> @v_constrained_sitofp_v3i16_to_v3f32_fpexcept_strict(<3 x i16> %arg) #0 { + %result = call <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i16(<3 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x float> %result +} + +define double @v_constrained_sitofp_i16_to_f64_fpexcept_strict(i16 %arg) #0 { + %result = call double @llvm.experimental.constrained.sitofp.f64.i16(i16 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret double %result +} + +define <2 x double> @v_constrained_sitofp_v2i16_to_v2f64_fpexcept_strict(<2 x i16> %arg) #0 { + %result = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i16(<2 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x double> %result +} + +define <3 x double> @v_constrained_sitofp_v3i16_to_v3f64_fpexcept_strict(<3 x i16> %arg) #0 { + %result = call <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i16(<3 x i16> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x double> %result +} + +declare half 
@llvm.experimental.constrained.sitofp.f16.i16(i16, metadata, metadata) #1 +declare <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i16(<2 x i16>, metadata, metadata) #1 +declare <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i16(<3 x i16>, metadata, metadata) #1 + +declare float @llvm.experimental.constrained.sitofp.f32.i16(i16, metadata, metadata) #1 +declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i16(<2 x i16>, metadata, metadata) #1 +declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i16(<3 x i16>, metadata, metadata) #1 + +declare double @llvm.experimental.constrained.sitofp.f64.i16(i16, metadata, metadata) #1 +declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i16(<2 x i16>, metadata, metadata) #1 +declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i16(<3 x i16>, metadata, metadata) #1 + +define half @v_constrained_sitofp_i32_to_f16_fpexcept_strict(i32 %arg) #0 { + %result = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret half %result +} + +define <2 x half> @v_constrained_sitofp_v2i32_to_v2f16_fpexcept_strict(<2 x i32> %arg) #0 { + %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x half> %result +} + +define <3 x half> @v_constrained_sitofp_v3i32_to_v3f16_fpexcept_strict(<3 x i32> %arg) #0 { + %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i32(<3 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x half> %result +} + +define float @v_constrained_sitofp_i32_to_f32_fpexcept_strict(i32 %arg) #0 { + %result = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret float %result +} + +define <2 x float> @v_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32> %arg) #0 { + %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x float> %result +} + +define <3 x float> @v_constrained_sitofp_v3i32_to_v3f32_fpexcept_strict(<3 x i32> %arg) #0 { + %result = call <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x float> %result +} + +define double @v_constrained_sitofp_i32_to_f64_fpexcept_strict(i32 %arg) #0 { + %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret double %result +} + +define <2 x double> @v_constrained_sitofp_v2i32_to_v2f64_fpexcept_strict(<2 x i32> %arg) #0 { + %result = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x double> %result +} + +define <3 x double> @v_constrained_sitofp_v3i32_to_v3f64_fpexcept_strict(<3 x i32> %arg) #0 { + %result = call <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x double> %result +} + +declare half @llvm.experimental.constrained.sitofp.f16.i32(i32, metadata, metadata) #1 +declare <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i32(<2 x i32>, metadata, metadata) #1 +declare <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i32(<3 x i32>, metadata, metadata) #1 
+ +declare float @llvm.experimental.constrained.sitofp.f32.i32(i32, metadata, metadata) #1 +declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32>, metadata, metadata) #1 +declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i32(<3 x i32>, metadata, metadata) #1 + +declare double @llvm.experimental.constrained.sitofp.f64.i32(i32, metadata, metadata) #1 +declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i32(<2 x i32>, metadata, metadata) #1 +declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i32(<3 x i32>, metadata, metadata) #1 + +define half @s_constrained_sitofp_i32_to_f16_fpexcept_strict(i32 inreg %arg) #0 { + %result = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret half %result +} + +define float @s_constrained_sitofp_i32_to_f32_fpexcept_strict(i32 inreg %arg) #0 { + %result = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret float %result +} + +define <2 x half> @s_constrained_sitofp_v2i32_to_v2f16_fpexcept_strict(<2 x i32> inreg %arg) #0 { + %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x half> %result +} + +define <2 x float> @s_constrained_sitofp_v2i32_to_v2f32_fpexcept_strict(<2 x i32> inreg %arg) #0 { + %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i32(<2 x i32> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x float> %result +} + +define half @v_constrained_sitofp_i64_to_f16_fpexcept_strict(i64 %arg) #0 { + %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret half %result +} + +define <2 x half> @v_constrained_sitofp_v2i64_to_v2f16_fpexcept_strict(<2 x i64> %arg) #0 { + %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x half> %result +} + +define <3 x half> @v_constrained_sitofp_v3i64_to_v3f16_fpexcept_strict(<3 x i64> %arg) #0 { + %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x half> %result +} + +define float @v_constrained_sitofp_i64_to_f32_fpexcept_strict(i64 %arg) #0 { + %result = call float @llvm.experimental.constrained.sitofp.f32.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret float %result +} + +define <2 x float> @v_constrained_sitofp_v2i64_to_v2f32_fpexcept_strict(<2 x i64> %arg) #0 { + %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x float> %result +} + +define <3 x float> @v_constrained_sitofp_v3i64_to_v3f32_fpexcept_strict(<3 x i64> %arg) #0 { + %result = call <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x float> %result +} + +define double @v_constrained_sitofp_i64_to_f64_fpexcept_strict(i64 %arg) #0 { + %result = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret double %result +} + +define <2 x double> @v_constrained_sitofp_v2i64_to_v2f64_fpexcept_strict(<2 x i64> %arg) 
#0 { + %result = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x double> %result +} + +define <3 x double> @v_constrained_sitofp_v3i64_to_v3f64_fpexcept_strict(<3 x i64> %arg) #0 { + %result = call <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x double> %result +} + +declare half @llvm.experimental.constrained.sitofp.f16.i64(i64, metadata, metadata) #1 +declare <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i64(<2 x i64>, metadata, metadata) #1 +declare <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i64(<3 x i64>, metadata, metadata) #1 + +declare float @llvm.experimental.constrained.sitofp.f32.i64(i64, metadata, metadata) #1 +declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i64(<2 x i64>, metadata, metadata) #1 +declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i64(<3 x i64>, metadata, metadata) #1 + +declare double @llvm.experimental.constrained.sitofp.f64.i64(i64, metadata, metadata) #1 +declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i64(<2 x i64>, metadata, metadata) #1 +declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i64(<3 x i64>, metadata, metadata) #1 + +define half @v_constrained_sitofp_i8_to_f16_fpexcept_strict(i8 %arg) #0 { + %result = call half @llvm.experimental.constrained.sitofp.f16.i8(i8 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret half %result +} + +define <2 x half> @v_constrained_sitofp_v2i8_to_v2f16_fpexcept_strict(<2 x i8> %arg) #0 { + %result = call <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i8(<2 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x half> %result +} + +define <3 x half> @v_constrained_sitofp_v3i8_to_v3f16_fpexcept_strict(<3 x i8> %arg) #0 { + %result = call <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i8(<3 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x half> %result +} + +define float @v_constrained_sitofp_i8_to_f32_fpexcept_strict(i8 %arg) #0 { + %result = call float @llvm.experimental.constrained.sitofp.f32.i8(i8 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret float %result +} + +define <2 x float> @v_constrained_sitofp_v2i8_to_v2f32_fpexcept_strict(<2 x i8> %arg) #0 { + %result = call <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i8(<2 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x float> %result +} + +define <3 x float> @v_constrained_sitofp_v3i8_to_v3f32_fpexcept_strict(<3 x i8> %arg) #0 { + %result = call <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i8(<3 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x float> %result +} + +define double @v_constrained_sitofp_i8_to_f64_fpexcept_strict(i8 %arg) #0 { + %result = call double @llvm.experimental.constrained.sitofp.f64.i8(i8 %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret double %result +} + +define <2 x double> @v_constrained_sitofp_v2i8_to_v2f64_fpexcept_strict(<2 x i8> %arg) #0 { + %result = call <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i8(<2 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <2 x double> %result +} + +define <3 x double> @v_constrained_sitofp_v3i8_to_v3f64_fpexcept_strict(<3 x i8> %arg) #0 { + %result = call <3 x double> 
@llvm.experimental.constrained.sitofp.v3f64.v3i8(<3 x i8> %arg, metadata !"round.dynamic", metadata !"fpexcept.strict") + ret <3 x double> %result +} + +declare half @llvm.experimental.constrained.sitofp.f16.i8(i8, metadata, metadata) #1 +declare <2 x half> @llvm.experimental.constrained.sitofp.v2f16.v2i8(<2 x i8>, metadata, metadata) #1 +declare <3 x half> @llvm.experimental.constrained.sitofp.v3f16.v3i8(<3 x i8>, metadata, metadata) #1 + +declare float @llvm.experimental.constrained.sitofp.f32.i8(i8, metadata, metadata) #1 +declare <2 x float> @llvm.experimental.constrained.sitofp.v2f32.v2i8(<2 x i8>, metadata, metadata) #1 +declare <3 x float> @llvm.experimental.constrained.sitofp.v3f32.v3i8(<3 x i8>, metadata, metadata) #1 + +declare double @llvm.experimental.constrained.sitofp.f64.i8(i8, metadata, metadata) #1 +declare <2 x double> @llvm.experimental.constrained.sitofp.v2f64.v2i8(<2 x i8>, metadata, metadata) #1 +declare <3 x double> @llvm.experimental.constrained.sitofp.v3f64.v3i8(<3 x i8>, metadata, metadata) #1 + +attributes #0 = { strictfp } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} +; GFX8: {{.*}} +; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll new file mode 100644 index 00000000000000..57e4cec4eccb11 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll @@ -0,0 +1,558 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s + +declare void @f16_user(half) +declare half @f16_result() + +declare void @v2f16_user(<2 x half>) +declare <2 x half> @v2f16_result() + +declare void @v4f16_user(<4 x half>) +declare <4 x half> @v4f16_result() + +declare void @v8f16_user(<8 x half>) +declare <8 x half> @v8f16_result() + +define void @f16_arg(half %arg, ptr %ptr) #0 { +; GFX7-LABEL: f16_arg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: flat_store_dword v[1:2], v0 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fpext = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict") + store float %fpext, ptr %ptr + ret void +} + +define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 { +; GFX7-LABEL: v2f16_arg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v2 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: flat_store_dword v[2:3], v4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict") + store <2 x float> %fpext, ptr %ptr + ret void +} + +define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 { +; GFX7-LABEL: v3f16_arg: +; GFX7: ; %bb.0: +; GFX7-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v3 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v3 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v6 +; GFX7-NEXT: flat_store_dword v[3:4], v5 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict") + store <3 x float> %fpext, ptr %ptr + ret void +} + +define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { +; GFX7-LABEL: v4f16_arg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v3 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v7 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v4 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v3 +; GFX7-NEXT: flat_store_dword v[4:5], v6 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict") + store <4 x float> %fpext, ptr %ptr + ret void +} + +define half @f16_return(float %arg) #0 { +; GFX7-LABEL: f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret half %fptrunc +} + +define <2 x half> @v2f16_return(<2 x float> %arg) #0 { +; GFX7-LABEL: v2f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <2 x half> %fptrunc +} + +define <3 x half> @v3f16_return(<3 x float> %arg) #0 { +; GFX7-LABEL: v3f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <3 x half> %fptrunc +} + +define <4 x half> @v4f16_return(<4 x float> %arg) #0 { +; GFX7-LABEL: v4f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: s_setpc_b64 s[30:31] + %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") + ret <4 x half> %fptrunc +} + +define void @outgoing_f16_arg(ptr %ptr) #0 { +; GFX7-LABEL: outgoing_f16_arg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s16, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[18:19] +; GFX7-NEXT: flat_load_ushort v0, v[0:1] +; GFX7-NEXT: v_writelane_b32 v40, s16, 2 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: s_mov_b32 s17, f16_user@abs32@hi +; GFX7-NEXT: s_mov_b32 s16, f16_user@abs32@lo +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %val = load half, ptr %ptr + call void @f16_user(half %val) + ret void +} + +define void @outgoing_v2f16_arg(ptr %ptr) #0 { +; GFX7-LABEL: outgoing_v2f16_arg: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s16, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[18:19] +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: v_writelane_b32 v40, s16, 2 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: s_mov_b32 s17, v2f16_user@abs32@hi +; GFX7-NEXT: s_mov_b32 s16, v2f16_user@abs32@lo +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1 +; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %val = load <2 x half>, ptr %ptr + call void @v2f16_user(<2 x half> %val) + ret void +} + +define void @outgoing_f16_return(ptr %ptr) #0 { +; GFX7-LABEL: outgoing_f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s16, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[18:19] +; GFX7-NEXT: v_writelane_b32 v40, s16, 2 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: s_mov_b32 s17, f16_result@abs32@hi +; GFX7-NEXT: s_mov_b32 s16, f16_result@abs32@lo +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v42, v1 +; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: flat_store_short v[41:42], v0 +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %val = call half @f16_result() + store half %val, ptr %ptr + ret void +} + +define void @outgoing_v2f16_return(ptr %ptr) #0 { +; GFX7-LABEL: outgoing_v2f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s16, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[18:19] +; GFX7-NEXT: v_writelane_b32 v40, s16, 2 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: s_mov_b32 s17, v2f16_result@abs32@hi +; GFX7-NEXT: s_mov_b32 s16, v2f16_result@abs32@lo +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v42, v1 +; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; 
GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-NEXT: flat_store_dword v[41:42], v0 +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %val = call <2 x half> @v2f16_result() + store <2 x half> %val, ptr %ptr + ret void +} + +define void @outgoing_v4f16_return(ptr %ptr) #0 { +; GFX7-LABEL: outgoing_v4f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s16, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[18:19] +; GFX7-NEXT: v_writelane_b32 v40, s16, 2 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: s_mov_b32 s17, v4f16_result@abs32@hi +; GFX7-NEXT: s_mov_b32 s16, v4f16_result@abs32@lo +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v42, v1 +; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v4, v0, v1 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v41 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: flat_store_dword v[41:42], v4 +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] + %val = call <4 x half> @v4f16_result() + store <4 x half> %val, ptr %ptr + ret void +} + +define 
void @outgoing_v8f16_return(ptr %ptr) #0 { +; GFX7-LABEL: outgoing_v8f16_return: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s16, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[18:19] +; GFX7-NEXT: v_writelane_b32 v40, s16, 2 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: s_mov_b32 s17, v8f16_result@abs32@hi +; GFX7-NEXT: s_mov_b32 s16, v8f16_result@abs32@lo +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v42, v1 +; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX7-NEXT: v_or_b32_e32 v8, v0, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v5 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v2, v2, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 +; GFX7-NEXT: v_or_b32_e32 v3, v4, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v41 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v3 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v41 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v5 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v41 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: flat_store_dword v[0:1], v2 +; GFX7-NEXT: flat_store_dword v[41:42], v8 +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
+; GFX7-NEXT: s_setpc_b64 s[30:31] + %val = call <8 x half> @v8f16_result() + store <8 x half> %val, ptr %ptr + ret void +} + +define half @call_split_type_used_outside_block_v8f16() #0 { +; GFX7-LABEL: call_split_type_used_outside_block_v8f16: +; GFX7: ; %bb.0: ; %bb0 +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s16, s33 +; GFX7-NEXT: s_mov_b32 s33, s32 +; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[18:19] +; GFX7-NEXT: v_writelane_b32 v40, s16, 2 +; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: s_mov_b32 s17, v8f16_result@abs32@hi +; GFX7-NEXT: s_mov_b32 s16, v8f16_result@abs32@lo +; GFX7-NEXT: s_addk_i32 s32, 0x400 +; GFX7-NEXT: v_writelane_b32 v40, s31, 1 +; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_readlane_b32 s31, v40, 1 +; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[6:7] +; GFX7-NEXT: s_addk_i32 s32, 0xfc00 +; GFX7-NEXT: s_mov_b32 s33, s4 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_setpc_b64 s[30:31] +bb0: + %split.ret.type = call <8 x half> @v8f16_result() + br label %bb1 + +bb1: + %extract = extractelement <8 x half> %split.ret.type, i32 0 + ret half %extract +} + +declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0 +declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0 +declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, metadata) #0 +declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) #0 + +declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0 +declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0 +declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #0 +declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float>, metadata, metadata) #0 + +attributes #0 = { strictfp } diff --git a/llvm/test/CodeGen/ARM/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/ARM/strictfp_f16_abi_promote.ll new file mode 100644 index 00000000000000..2da8ea66a0b958 --- /dev/null +++ 
b/llvm/test/CodeGen/ARM/strictfp_f16_abi_promote.ll
@@ -0,0 +1,115 @@
+; RUN: llc -mtriple=arm-eabi -mattr=+v8.2a,+neon,-fullfp16 -float-abi=hard < %s | FileCheck -check-prefix=NOFP16 %s
+
+declare void @f16_user(half)
+declare half @f16_result()
+
+declare void @v2f16_user(<2 x half>)
+declare <2 x half> @v2f16_result()
+
+declare void @v4f16_user(<4 x half>)
+declare <4 x half> @v4f16_result()
+
+declare void @v8f16_user(<8 x half>)
+declare <8 x half> @v8f16_result()
+
+define void @f16_arg(half %arg, ptr %ptr) #0 {
+  %fpext = call float @llvm.experimental.constrained.fpext.f32.f16(half %arg, metadata !"fpexcept.strict")
+  store float %fpext, ptr %ptr
+  ret void
+}
+
+define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 {
+  %fpext = call <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half> %arg, metadata !"fpexcept.strict")
+  store <2 x float> %fpext, ptr %ptr
+  ret void
+}
+
+define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 {
+  %fpext = call <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half> %arg, metadata !"fpexcept.strict")
+  store <3 x float> %fpext, ptr %ptr
+  ret void
+}
+
+define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
+  %fpext = call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %arg, metadata !"fpexcept.strict")
+  store <4 x float> %fpext, ptr %ptr
+  ret void
+}
+
+define half @f16_return(float %arg) #0 {
+  %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret half %fptrunc
+}
+
+define <2 x half> @v2f16_return(<2 x float> %arg) #0 {
+  %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret <2 x half> %fptrunc
+}
+
+define <3 x half> @v3f16_return(<3 x float> %arg) #0 {
+  %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret <3 x half> %fptrunc
+}
+
+define <4 x half> @v4f16_return(<4 x float> %arg) #0 {
+  %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+  ret <4 x half> %fptrunc
+}
+
+define void @outgoing_f16_arg(ptr %ptr) #0 {
+  %val = load half, ptr %ptr
+  call void @f16_user(half %val)
+  ret void
+}
+
+define void @outgoing_v2f16_arg(ptr %ptr) #0 {
+  %val = load <2 x half>, ptr %ptr
+  call void @v2f16_user(<2 x half> %val)
+  ret void
+}
+
+define void @outgoing_f16_return(ptr %ptr) #0 {
+  %val = call half @f16_result()
+  store half %val, ptr %ptr
+  ret void
+}
+
+define void @outgoing_v2f16_return(ptr %ptr) #0 {
+  %val = call <2 x half> @v2f16_result()
+  store <2 x half> %val, ptr %ptr
+  ret void
+}
+
+define void @outgoing_v4f16_return(ptr %ptr) #0 {
+  %val = call <4 x half> @v4f16_result()
+  store <4 x half> %val, ptr %ptr
+  ret void
+}
+
+define void @outgoing_v8f16_return(ptr %ptr) #0 {
+  %val = call <8 x half> @v8f16_result()
+  store <8 x half> %val, ptr %ptr
+  ret void
+}
+
+define half @call_split_type_used_outside_block_v8f16() #0 {
+bb0:
+  %split.ret.type = call <8 x half> @v8f16_result()
+  br label %bb1
+
+bb1:
+  %extract = extractelement <8 x half> %split.ret.type, i32 0
+  ret half %extract
+}
+
+declare float @llvm.experimental.constrained.fpext.f32.f16(half, metadata) #0
+declare <2 x float> @llvm.experimental.constrained.fpext.v2f32.v2f16(<2 x half>, metadata) #0
+declare <3 x float> @llvm.experimental.constrained.fpext.v3f32.v3f16(<3 x half>, metadata) #0
+declare <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half>, metadata) #0
+
+declare half @llvm.experimental.constrained.fptrunc.f16.f32(float, metadata, metadata) #0
+declare <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float>, metadata, metadata) #0
+declare <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float>, metadata, metadata) #0
+declare <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float>, metadata, metadata) #0
+
+attributes #0 = { strictfp }