diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index 2795de5eeeb66..69202e3fcbc57 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -50,7 +50,7 @@ def RetCC_SystemZ_ELF : CallingConv<[ // Sub-128 vectors are returned in the same way, but they're widened // to one of these types during type legalization. CCIfSubtarget<"hasVector()", - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>> ]>; @@ -116,19 +116,19 @@ def CC_SystemZ_ELF : CallingConv<[ // are passed in the same way, but they're widened to one of these types // during type legalization. CCIfSubtarget<"hasVector()", - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCIfArgFixed>>>, // However, sub-128 vectors which need to go on the stack occupy just a // single 8-byte-aligned 8-byte stack slot. Pass as i64. CCIfSubtarget<"hasVector()", - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCIfShortVector>>>, // Other vector arguments are passed in 8-byte-aligned 16-byte stack slots. CCIfSubtarget<"hasVector()", - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCAssignToStack<16, 8>>>, // Other arguments are passed in 8-byte-aligned 8-byte stack slots. 
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 2511d08a6d0ef..6dc3042d56d8a 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -123,6 +123,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass); addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass); addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass); + addRegisterClass(MVT::v8f16, &SystemZ::VR128BitRegClass); addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass); addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass); } @@ -620,6 +621,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // Handle floating-point vector types. if (Subtarget.hasVector()) { // Scalar-to-vector conversion is just a subreg. + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); @@ -627,6 +629,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // need to go via integers. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); @@ -842,6 +845,34 @@ bool SystemZTargetLowering::useSoftFloat() const { return Subtarget.hasSoftFloat(); } +unsigned +SystemZTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT, + std::optional<MVT> RegisterVT) const { + // i128 inline assembly operand. + if (VT == MVT::i128 && RegisterVT && *RegisterVT == MVT::Untyped) + return 1; + // Pass narrow fp16 vectors per the ABI even though they are generally + // expanded.
+ if (Subtarget.hasVector() && VT.isVector() && VT.getScalarType() == MVT::f16) + return divideCeil(VT.getVectorNumElements(), SystemZ::VectorBytes / 2); + return TargetLowering::getNumRegisters(Context, VT); +} + +MVT SystemZTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, + CallingConv::ID CC, + EVT VT) const { + // 128-bit single-element vector types are passed like other vectors, + // not like their element type. + if (VT.isVector() && VT.getSizeInBits() == 128 && + VT.getVectorNumElements() == 1) + return MVT::v16i8; + // Pass narrow fp16 vectors per the ABI even though they are generally + // expanded. + if (Subtarget.hasVector() && VT.isVector() && VT.getScalarType() == MVT::f16) + return MVT::v8f16; + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); +} + EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const { if (!VT.isVector()) @@ -2051,6 +2082,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments( case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: + case MVT::v8f16: case MVT::v4f32: case MVT::v2f64: RC = &SystemZ::VR128BitRegClass; @@ -6351,6 +6383,39 @@ bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const { return false; } +static SDValue mergeHighParts(SelectionDAG &DAG, const SDLoc &DL, + unsigned MergedBits, EVT VT, SDValue Op0, + SDValue Op1) { + MVT IntVecVT = MVT::getVectorVT(MVT::getIntegerVT(MergedBits), + SystemZ::VectorBits / MergedBits); + assert(VT.getSizeInBits() == 128 && IntVecVT.getSizeInBits() == 128 && + "Handling full vectors only."); + Op0 = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0); + Op1 = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op1); + SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH, DL, IntVecVT, Op0, Op1); + return DAG.getNode(ISD::BITCAST, DL, VT, Op); +} + +static SDValue buildFPVecFromScalars4(SelectionDAG &DAG, const SDLoc &DL, + EVT VT, SmallVectorImpl<SDValue> &Elems, + unsigned Pos) { + SDValue Op01 = buildMergeScalars(DAG, DL,
VT, Elems[Pos + 0], Elems[Pos + 1]); + SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[Pos + 2], Elems[Pos + 3]); + // Avoid unnecessary undefs by reusing the other operand. + if (Op01.isUndef()) { + if (Op23.isUndef()) + return Op01; + Op01 = Op23; + } + else if (Op23.isUndef()) + Op23 = Op01; + // Merging identical replications is a no-op. + if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23) + return Op01; + unsigned MergedBits = VT.getSimpleVT().getScalarSizeInBits() * 2; + return mergeHighParts(DAG, DL, MergedBits, VT, Op01, Op23); +} + // Combine GPR scalar values Elems into a vector of type VT. SDValue SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT, @@ -6409,22 +6474,22 @@ SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT, // // V VMRHG // - if (VT == MVT::v4f32 && !AllLoads) { - SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]); - SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]); + if (VT == MVT::v4f32 && !AllLoads) + return buildFPVecFromScalars4(DAG, DL, VT, Elems, 0); + + // Same for v8f16. + if (VT == MVT::v8f16 && !AllLoads) { + SDValue Op0123 = buildFPVecFromScalars4(DAG, DL, VT, Elems, 0); + SDValue Op4567 = buildFPVecFromScalars4(DAG, DL, VT, Elems, 4); // Avoid unnecessary undefs by reusing the other operand. - if (Op01.isUndef()) - Op01 = Op23; - else if (Op23.isUndef()) - Op23 = Op01; + if (Op0123.isUndef()) + Op0123 = Op4567; + else if (Op4567.isUndef()) + Op4567 = Op0123; // Merging identical replications is a no-op.
- if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23) - return Op01; - Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01); - Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23); - SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH, - DL, MVT::v2i64, Op01, Op23); - return DAG.getNode(ISD::BITCAST, DL, VT, Op); + if (Op0123.getOpcode() == SystemZISD::REPLICATE && Op0123 == Op4567) + return Op0123; + return mergeHighParts(DAG, DL, 64, VT, Op0123, Op4567); } // Collect the constant terms. diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 13a1cd1614a53..9ea7f3e556971 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -64,27 +64,20 @@ class SystemZTargetLowering : public TargetLowering { // // (c) there are no multiplication instructions for the widest integer // type (v2i64). + + // Expand (narrow) f16 vectors during type legalization to avoid + // operations for all elements as with expansion after widening. + if (VT.getScalarType() == MVT::f16) + return VT.getVectorElementCount().isScalar() ? TypeScalarizeVector + : TypeSplitVector; if (VT.getScalarSizeInBits() % 8 == 0) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } - unsigned - getNumRegisters(LLVMContext &Context, EVT VT, - std::optional<MVT> RegisterVT) const override { - // i128 inline assembly operand. - if (VT == MVT::i128 && RegisterVT && *RegisterVT == MVT::Untyped) - return 1; - return TargetLowering::getNumRegisters(Context, VT); - } + unsigned getNumRegisters(LLVMContext &Context, EVT VT, + std::optional<MVT> RegisterVT) const override; MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, - EVT VT) const override { - // 128-bit single-element vector types are passed like other vectors, - // not like their element type.
- if (VT.isVector() && VT.getSizeInBits() == 128 && - VT.getVectorNumElements() == 1) - return MVT::v16i8; - return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); - } + EVT VT) const override; bool isCheapToSpeculateCtlz(Type *) const override { return true; } bool isCheapToSpeculateCttz(Type *) const override { return true; } bool preferZeroCompareBranch() const override { return true; } diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td index 479bab5ce62b8..3eb66d06cc16d 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td @@ -348,6 +348,7 @@ let Predicates = [FeatureVector] in { def VMRHH : BinaryVRRc<"vmrhh", 0xE761, z_merge_high, v128h, v128h, 1>; def VMRHF : BinaryVRRc<"vmrhf", 0xE761, z_merge_high, v128f, v128f, 2>; def VMRHG : BinaryVRRc<"vmrhg", 0xE761, z_merge_high, v128g, v128g, 3>; + def : BinaryRRWithType; def : BinaryRRWithType; def : BinaryRRWithType; @@ -357,6 +358,7 @@ let Predicates = [FeatureVector] in { def VMRLH : BinaryVRRc<"vmrlh", 0xE760, z_merge_low, v128h, v128h, 1>; def VMRLF : BinaryVRRc<"vmrlf", 0xE760, z_merge_low, v128f, v128f, 2>; def VMRLG : BinaryVRRc<"vmrlg", 0xE760, z_merge_low, v128g, v128g, 3>; + def : BinaryRRWithType; def : BinaryRRWithType; def : BinaryRRWithType; @@ -497,6 +499,7 @@ defm : GenericVectorOps; defm : GenericVectorOps; defm : GenericVectorOps; defm : GenericVectorOps; +defm : GenericVectorOps; defm : GenericVectorOps; defm : GenericVectorOps; @@ -2110,6 +2113,7 @@ def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (i128 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8f16 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 
VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (f128 VR128:$src))), (v16i8 VR128:$src)>; @@ -2118,6 +2122,7 @@ def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (i128 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (v8f16 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (f128 VR128:$src))), (v8i16 VR128:$src)>; @@ -2126,6 +2131,7 @@ def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (i128 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8f16 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (f128 VR128:$src))), (v4i32 VR128:$src)>; @@ -2134,15 +2140,26 @@ def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (i128 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8f16 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (f128 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (v16i8 
VR128:$src))), (v8f16 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8i16 VR128:$src))), (v8f16 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (v4i32 VR128:$src))), (v8f16 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (v2i64 VR128:$src))), (v8f16 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (i128 VR128:$src))), (v8f16 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (v4f32 VR128:$src))), (v8f16 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (v2f64 VR128:$src))), (v8f16 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (f128 VR128:$src))), (v8f16 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (i128 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8f16 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (f128 VR128:$src))), (v4f32 VR128:$src)>; @@ -2151,6 +2168,7 @@ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (i128 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8f16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (f128 VR128:$src))), (v2f64 VR128:$src)>; @@ -2159,6 +2177,7 @@ def : Pat<(f128 (bitconvert (v8i16 VR128:$src))), (f128 VR128:$src)>; def : Pat<(f128 (bitconvert (v4i32 VR128:$src))), (f128 VR128:$src)>; def : Pat<(f128 (bitconvert (v2i64 VR128:$src))), (f128 VR128:$src)>; def : Pat<(f128 (bitconvert (i128 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v8f16 VR128:$src))), 
(f128 VR128:$src)>; def : Pat<(f128 (bitconvert (v4f32 VR128:$src))), (f128 VR128:$src)>; def : Pat<(f128 (bitconvert (v2f64 VR128:$src))), (f128 VR128:$src)>; @@ -2166,6 +2185,7 @@ def : Pat<(i128 (bitconvert (v16i8 VR128:$src))), (i128 VR128:$src)>; def : Pat<(i128 (bitconvert (v8i16 VR128:$src))), (i128 VR128:$src)>; def : Pat<(i128 (bitconvert (v4i32 VR128:$src))), (i128 VR128:$src)>; def : Pat<(i128 (bitconvert (v2i64 VR128:$src))), (i128 VR128:$src)>; +def : Pat<(i128 (bitconvert (v8f16 VR128:$src))), (i128 VR128:$src)>; def : Pat<(i128 (bitconvert (v4f32 VR128:$src))), (i128 VR128:$src)>; def : Pat<(i128 (bitconvert (v2f64 VR128:$src))), (i128 VR128:$src)>; def : Pat<(i128 (bitconvert (f128 VR128:$src))), (i128 VR128:$src)>; @@ -2216,6 +2236,7 @@ multiclass ScalarToVectorFP; } +defm : ScalarToVectorFP; defm : ScalarToVectorFP; defm : ScalarToVectorFP; @@ -2236,6 +2257,11 @@ let AddedComplexity = 4 in { // 3 added by TableGen for the base register operand in VLGV-based integer // extractions and ensures that this version is strictly better. let AddedComplexity = 4 in { + def : Pat<(f16 (z_vector_extract (v8f16 VR128:$vec), 0)), + (EXTRACT_SUBREG VR128:$vec, subreg_h16)>; + def : Pat<(f16 (z_vector_extract (v8f16 VR128:$vec), imm32zx3:$index)), + (EXTRACT_SUBREG (VREPH VR128:$vec, imm32zx2:$index), subreg_h16)>; + def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), 0)), (EXTRACT_SUBREG VR128:$vec, subreg_h32)>; def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), imm32zx2:$index)), diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td index e79f12b449a88..1ef8e81c8f829 100644 --- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -305,13 +305,13 @@ defm VR64 : SystemZRegClass<"VR64", [f64, v8i8, v4i16, v2i32, v2f32], 64, // The subset of vector registers that can be used for floating-point // operations too. 
defm VF128 : SystemZRegClass<"VF128", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, - (sequence "V%u", 0, 15)>; + [v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + 128, (sequence "V%u", 0, 15)>; // All vector registers. defm VR128 : SystemZRegClass<"VR128", [v16i8, v8i16, v4i32, v2i64, i128, - v4f32, v2f64, f128], + v8f16, v4f32, v2f64, f128], 128, (add (sequence "V%u", 0, 7), (sequence "V%u", 16, 31), (sequence "V%u", 8, 15))>; diff --git a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll index e02f931c4d31e..d0f3414e89497 100644 --- a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll +++ b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll @@ -111,87 +111,93 @@ define void @canonicalize_ptr_f128(ptr %out) { define <8 x half> @canonicalize_v8f16(<8 x half> %a) nounwind { ; Z16-LABEL: canonicalize_v8f16: ; Z16: # %bb.0: -; Z16-NEXT: stmg %r13, %r15, 104(%r15) +; Z16-NEXT: stmg %r14, %r15, 112(%r15) ; Z16-NEXT: aghi %r15, -224 -; Z16-NEXT: std %f8, 216(%r15) # 8-byte Spill -; Z16-NEXT: std %f9, 208(%r15) # 8-byte Spill -; Z16-NEXT: std %f10, 200(%r15) # 8-byte Spill -; Z16-NEXT: std %f11, 192(%r15) # 8-byte Spill -; Z16-NEXT: std %f12, 184(%r15) # 8-byte Spill -; Z16-NEXT: std %f13, 176(%r15) # 8-byte Spill -; Z16-NEXT: std %f14, 168(%r15) # 8-byte Spill -; Z16-NEXT: std %f15, 160(%r15) # 8-byte Spill -; Z16-NEXT: vlreph %v11, 414(%r15) -; Z16-NEXT: vlreph %v12, 406(%r15) -; Z16-NEXT: vlreph %v13, 398(%r15) -; Z16-NEXT: vlreph %v14, 390(%r15) -; Z16-NEXT: ldr %f8, %f6 -; Z16-NEXT: ldr %f9, %f4 -; Z16-NEXT: ldr %f10, %f2 -; Z16-NEXT: lgr %r13, %r2 +; Z16-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill +; Z16-NEXT: vreph %v0, %v24, 7 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f15, %f0 -; Z16-NEXT: ldr %f0, %f10 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 
+; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 6 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f10, %f0 -; Z16-NEXT: ldr %f0, %f9 +; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v0, %v1 +; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 5 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f9, %f0 -; Z16-NEXT: ldr %f0, %f8 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 4 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f8, %f0 -; Z16-NEXT: ldr %f0, %f14 +; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v0, %v1 +; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; Z16-NEXT: vmrhf %v0, %v0, %v1 +; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 3 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f14, %f0 -; Z16-NEXT: ldr %f0, %f13 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 176(%r15), 3 # 
16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 2 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f13, %f0 -; Z16-NEXT: ldr %f0, %f12 +; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v0, %v1 +; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f12, %f0 -; Z16-NEXT: ldr %f0, %f11 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 1 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: vsteh %v0, 14(%r13), 0 -; Z16-NEXT: vsteh %v12, 12(%r13), 0 -; Z16-NEXT: vsteh %v13, 10(%r13), 0 -; Z16-NEXT: vsteh %v14, 8(%r13), 0 -; Z16-NEXT: vsteh %v8, 6(%r13), 0 -; Z16-NEXT: vsteh %v9, 4(%r13), 0 -; Z16-NEXT: vsteh %v10, 2(%r13), 0 -; Z16-NEXT: vsteh %v15, 0(%r13), 0 -; Z16-NEXT: ld %f8, 216(%r15) # 8-byte Reload -; Z16-NEXT: ld %f9, 208(%r15) # 8-byte Reload -; Z16-NEXT: ld %f10, 200(%r15) # 8-byte Reload -; Z16-NEXT: ld %f11, 192(%r15) # 8-byte Reload -; Z16-NEXT: ld %f12, 184(%r15) # 8-byte Reload -; Z16-NEXT: ld %f13, 176(%r15) # 8-byte Reload -; Z16-NEXT: ld %f14, 168(%r15) # 8-byte Reload -; Z16-NEXT: ld %f15, 160(%r15) # 8-byte Reload -; Z16-NEXT: lmg %r13, %r15, 328(%r15) +; Z16-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def 
$v0 +; Z16-NEXT: vmrhh %v0, %v1, %v0 +; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload +; Z16-NEXT: vmrhf %v0, %v0, %v1 +; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; Z16-NEXT: vmrhg %v24, %v0, %v1 +; Z16-NEXT: lmg %r14, %r15, 336(%r15) ; Z16-NEXT: br %r14 %canonicalized = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %a) ret <8 x half> %canonicalized @@ -253,85 +259,93 @@ define void @canonicalize_ptr_v8f16(ptr %out) nounwind { ; Z16: # %bb.0: ; Z16-NEXT: stmg %r13, %r15, 104(%r15) ; Z16-NEXT: aghi %r15, -224 -; Z16-NEXT: std %f8, 216(%r15) # 8-byte Spill -; Z16-NEXT: std %f9, 208(%r15) # 8-byte Spill -; Z16-NEXT: std %f10, 200(%r15) # 8-byte Spill -; Z16-NEXT: std %f11, 192(%r15) # 8-byte Spill -; Z16-NEXT: std %f12, 184(%r15) # 8-byte Spill -; Z16-NEXT: std %f13, 176(%r15) # 8-byte Spill -; Z16-NEXT: std %f14, 168(%r15) # 8-byte Spill -; Z16-NEXT: std %f15, 160(%r15) # 8-byte Spill -; Z16-NEXT: vlreph %v0, 0(%r2) -; Z16-NEXT: vlreph %v8, 14(%r2) -; Z16-NEXT: vlreph %v9, 12(%r2) -; Z16-NEXT: vlreph %v10, 10(%r2) +; Z16-NEXT: vl %v0, 0(%r2), 3 ; Z16-NEXT: lgr %r13, %r2 -; Z16-NEXT: vlreph %v11, 8(%r2) -; Z16-NEXT: vlreph %v12, 6(%r2) -; Z16-NEXT: vlreph %v13, 4(%r2) -; Z16-NEXT: vlreph %v14, 2(%r2) +; Z16-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; Z16-NEXT: vreph %v0, %v0, 7 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f15, %f0 -; Z16-NEXT: ldr %f0, %f14 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 6 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f14, %f0 -; Z16-NEXT: ldr %f0, %f13 +; Z16-NEXT: 
vl %v1, 192(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v0, %v1 +; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 5 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f13, %f0 -; Z16-NEXT: ldr %f0, %f12 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 4 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f12, %f0 -; Z16-NEXT: ldr %f0, %f11 +; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v0, %v1 +; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; Z16-NEXT: vmrhf %v0, %v0, %v1 +; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 3 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f11, %f0 -; Z16-NEXT: ldr %f0, %f10 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 2 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f10, %f0 -; Z16-NEXT: ldr %f0, %f9 +; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload +; 
Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v0, %v1 +; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f9, %f0 -; Z16-NEXT: ldr %f0, %f8 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 1 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: vsteh %v9, 12(%r13), 0 -; Z16-NEXT: vsteh %v10, 10(%r13), 0 -; Z16-NEXT: vsteh %v11, 8(%r13), 0 -; Z16-NEXT: vsteh %v12, 6(%r13), 0 -; Z16-NEXT: vsteh %v13, 4(%r13), 0 -; Z16-NEXT: vsteh %v14, 2(%r13), 0 -; Z16-NEXT: vsteh %v15, 0(%r13), 0 -; Z16-NEXT: ld %f8, 216(%r15) # 8-byte Reload -; Z16-NEXT: ld %f9, 208(%r15) # 8-byte Reload -; Z16-NEXT: ld %f10, 200(%r15) # 8-byte Reload -; Z16-NEXT: ld %f11, 192(%r15) # 8-byte Reload -; Z16-NEXT: ld %f12, 184(%r15) # 8-byte Reload -; Z16-NEXT: ld %f13, 176(%r15) # 8-byte Reload -; Z16-NEXT: ld %f14, 168(%r15) # 8-byte Reload -; Z16-NEXT: ld %f15, 160(%r15) # 8-byte Reload -; Z16-NEXT: vsteh %v0, 14(%r13), 0 +; Z16-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v1, %v0 +; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload +; Z16-NEXT: vmrhf %v0, %v0, %v1 +; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; Z16-NEXT: vmrhg %v0, %v0, %v1 +; Z16-NEXT: vst %v0, 0(%r13), 3 ; Z16-NEXT: lmg %r13, %r15, 328(%r15) ; Z16-NEXT: br %r14 %val = load <8 x half>, ptr %out diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll 
b/llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll new file mode 100644 index 0000000000000..8d28e8317bb62 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-args.ll @@ -0,0 +1,709 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR +; +; Test passing fp16 vector arguments. + +@Fnptr = external global ptr +@Src = external global ptr +@Dst = external global ptr + +%Ty0 = type <8 x half> +define void @fun0_arg(%Ty0 %A) { +; CHECK-LABEL: fun0_arg: +; CHECK: # %bb.0: +; CHECK-NEXT: lgh %r0, 166(%r15) +; CHECK-NEXT: # kill: def $f6h killed $f6h def $f6d +; CHECK-NEXT: # kill: def $f4h killed $f4h def $f4d +; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgh %r1, 174(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f1, %r0 +; CHECK-NEXT: lgh %r0, 182(%r15) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: lgh %r2, 190(%r15) +; CHECK-NEXT: ldgr %f3, %r1 +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f5, %r0 +; CHECK-NEXT: sllg %r0, %r2, 48 +; CHECK-NEXT: lgrl %r1, Dst@GOT +; CHECK-NEXT: ldgr %f7, %r0 +; CHECK-NEXT: lgdr %r0, %f6 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r1) +; CHECK-NEXT: lgdr %r0, %f4 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r1) +; CHECK-NEXT: lgdr %r0, %f2 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r1) +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r1) +; CHECK-NEXT: lgdr %r0, %f7 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 14(%r1) +; CHECK-NEXT: lgdr %r0, %f5 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 12(%r1) +; CHECK-NEXT: lgdr %r0, %f3 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 10(%r1) +; CHECK-NEXT: lgdr %r0, %f1 +; 
CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 8(%r1) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun0_arg: +; VECTOR: # %bb.0: +; VECTOR-NEXT: lgrl %r1, Dst@GOT +; VECTOR-NEXT: vst %v24, 0(%r1), 3 +; VECTOR-NEXT: br %r14 + store %Ty0 %A, ptr @Dst + ret void +} + +define void @fun0_call() { +; CHECK-LABEL: fun0_call: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: lgrl %r1, Src@GOT +; CHECK-NEXT: lgh %r0, 0(%r1) +; CHECK-NEXT: lgh %r2, 2(%r1) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: lgh %r0, 4(%r1) +; CHECK-NEXT: sllg %r2, %r2, 48 +; CHECK-NEXT: ldgr %f2, %r2 +; CHECK-NEXT: lgh %r2, 6(%r1) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f4, %r0 +; CHECK-NEXT: lgh %r0, 8(%r1) +; CHECK-NEXT: sllg %r2, %r2, 48 +; CHECK-NEXT: ldgr %f6, %r2 +; CHECK-NEXT: lgh %r2, 10(%r1) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f1, %r0 +; CHECK-NEXT: lgh %r0, 12(%r1) +; CHECK-NEXT: sllg %r2, %r2, 48 +; CHECK-NEXT: lgh %r1, 14(%r1) +; CHECK-NEXT: ldgr %f3, %r2 +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f5, %r0 +; CHECK-NEXT: sllg %r0, %r1, 48 +; CHECK-NEXT: ldgr %f7, %r0 +; CHECK-NEXT: lgdr %r0, %f7 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 190(%r15) +; CHECK-NEXT: lgdr %r0, %f5 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 182(%r15) +; CHECK-NEXT: lgdr %r0, %f3 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 174(%r15) +; CHECK-NEXT: lgdr %r0, %f1 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 166(%r15) +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: # kill: def $f2h killed $f2h killed $f2d +; CHECK-NEXT: # kill: def $f4h killed $f4h killed $f4d +; CHECK-NEXT: # kill: def $f6h killed $f6h killed $f6d +; CHECK-NEXT: brasl %r14, Fnptr@PLT +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; 
CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun0_call: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: lgrl %r1, Src@GOT +; VECTOR-NEXT: vl %v24, 0(%r1), 3 +; VECTOR-NEXT: brasl %r14, Fnptr@PLT +; VECTOR-NEXT: lmg %r14, %r15, 272(%r15) +; VECTOR-NEXT: br %r14 + %L = load %Ty0, ptr @Src + call void @Fnptr(%Ty0 %L) + ret void +} + +define %Ty0 @fun0_ret() { +; CHECK-LABEL: fun0_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: lgrl %r1, Src@GOT +; CHECK-NEXT: lg %r0, 8(%r1) +; CHECK-NEXT: lg %r1, 0(%r1) +; CHECK-NEXT: stg %r0, 8(%r2) +; CHECK-NEXT: stg %r1, 0(%r2) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun0_ret: +; VECTOR: # %bb.0: +; VECTOR-NEXT: lgrl %r1, Src@GOT +; VECTOR-NEXT: vl %v24, 0(%r1), 3 +; VECTOR-NEXT: br %r14 + %L = load %Ty0, ptr @Src + ret %Ty0 %L +} + +define void @fun0_store_returned() { +; CHECK-LABEL: fun0_store_returned: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, Fnptr@PLT +; CHECK-NEXT: lg %r0, 168(%r15) +; CHECK-NEXT: lgrl %r1, Dst@GOT +; CHECK-NEXT: lg %r2, 160(%r15) +; CHECK-NEXT: stg %r0, 8(%r1) +; CHECK-NEXT: stg %r2, 0(%r1) +; CHECK-NEXT: lmg %r14, %r15, 288(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun0_store_returned: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: brasl %r14, Fnptr@PLT +; VECTOR-NEXT: lgrl %r1, Dst@GOT +; VECTOR-NEXT: vst %v24, 0(%r1), 3 +; VECTOR-NEXT: lmg %r14, %r15, 272(%r15) +; VECTOR-NEXT: br %r14 + %C = call %Ty0 @Fnptr() + store %Ty0 %C, ptr @Dst + 
ret void +} + +%Ty1 = type <4 x half> +define void @fun1_arg(%Ty1 %A) { +; CHECK-LABEL: fun1_arg: +; CHECK: # %bb.0: +; CHECK-NEXT: lgrl %r1, Dst@GOT +; CHECK-NEXT: # kill: def $f6h killed $f6h def $f6d +; CHECK-NEXT: # kill: def $f4h killed $f4h def $f4d +; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f6 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r1) +; CHECK-NEXT: lgdr %r0, %f4 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r1) +; CHECK-NEXT: lgdr %r0, %f2 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r1) +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r1) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun1_arg: +; VECTOR: # %bb.0: +; VECTOR-NEXT: lgrl %r1, Dst@GOT +; VECTOR-NEXT: vreph %v0, %v24, 1 +; VECTOR-NEXT: vreph %v1, %v24, 2 +; VECTOR-NEXT: vreph %v2, %v24, 3 +; VECTOR-NEXT: vsteh %v24, 0(%r1), 0 +; VECTOR-NEXT: vsteh %v2, 6(%r1), 0 +; VECTOR-NEXT: vsteh %v1, 4(%r1), 0 +; VECTOR-NEXT: vsteh %v0, 2(%r1), 0 +; VECTOR-NEXT: br %r14 + store %Ty1 %A, ptr @Dst + ret void +} + +define void @fun1_call() { +; CHECK-LABEL: fun1_call: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgrl %r1, Src@GOT +; CHECK-NEXT: lgh %r0, 0(%r1) +; CHECK-NEXT: lgh %r2, 2(%r1) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: lgh %r0, 4(%r1) +; CHECK-NEXT: sllg %r2, %r2, 48 +; CHECK-NEXT: lgh %r1, 6(%r1) +; CHECK-NEXT: ldgr %f2, %r2 +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f4, %r0 +; CHECK-NEXT: sllg %r0, %r1, 48 +; CHECK-NEXT: ldgr %f6, %r0 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: # kill: def $f2h killed $f2h killed $f2d +; CHECK-NEXT: # kill: def $f4h killed $f4h killed $f4d +; 
CHECK-NEXT: # kill: def $f6h killed $f6h killed $f6d +; CHECK-NEXT: brasl %r14, Fnptr@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun1_call: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: lgrl %r1, Src@GOT +; VECTOR-NEXT: vlreph %v0, 0(%r1) +; VECTOR-NEXT: vlreph %v1, 2(%r1) +; VECTOR-NEXT: vlreph %v2, 4(%r1) +; VECTOR-NEXT: vlreph %v3, 6(%r1) +; VECTOR-NEXT: vmrhh %v2, %v2, %v3 +; VECTOR-NEXT: vmrhh %v0, %v0, %v1 +; VECTOR-NEXT: vmrhf %v0, %v0, %v2 +; VECTOR-NEXT: vmrhg %v24, %v0, %v0 +; VECTOR-NEXT: brasl %r14, Fnptr@PLT +; VECTOR-NEXT: lmg %r14, %r15, 272(%r15) +; VECTOR-NEXT: br %r14 + %L = load %Ty1, ptr @Src + call void @Fnptr(%Ty1 %L) + ret void +} + +define %Ty1 @fun1_ret() { +; CHECK-LABEL: fun1_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: lgrl %r1, Src@GOT +; CHECK-NEXT: lgh %r0, 0(%r1) +; CHECK-NEXT: lgh %r2, 2(%r1) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: lgh %r0, 4(%r1) +; CHECK-NEXT: sllg %r2, %r2, 48 +; CHECK-NEXT: lgh %r1, 6(%r1) +; CHECK-NEXT: ldgr %f2, %r2 +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f4, %r0 +; CHECK-NEXT: sllg %r0, %r1, 48 +; CHECK-NEXT: ldgr %f6, %r0 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: # kill: def $f2h killed $f2h killed $f2d +; CHECK-NEXT: # kill: def $f4h killed $f4h killed $f4d +; CHECK-NEXT: # kill: def $f6h killed $f6h killed $f6d +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun1_ret: +; VECTOR: # %bb.0: +; VECTOR-NEXT: lgrl %r1, Src@GOT +; VECTOR-NEXT: vlreph %v0, 0(%r1) +; VECTOR-NEXT: vlreph %v1, 2(%r1) +; VECTOR-NEXT: vlreph %v2, 4(%r1) +; VECTOR-NEXT: vlreph %v3, 6(%r1) +; VECTOR-NEXT: vmrhh %v2, %v2, %v3 +; VECTOR-NEXT: vmrhh %v0, %v0, %v1 +; VECTOR-NEXT: vmrhf %v0, %v0, %v2 +; VECTOR-NEXT: vmrhg %v24, %v0, %v0 +; VECTOR-NEXT: 
br %r14 + %L = load %Ty1, ptr @Src + ret %Ty1 %L +} + +define void @fun1_store_returned() { +; CHECK-LABEL: fun1_store_returned: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, Fnptr@PLT +; CHECK-NEXT: lgrl %r1, Dst@GOT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d +; CHECK-NEXT: # kill: def $f4h killed $f4h def $f4d +; CHECK-NEXT: # kill: def $f6h killed $f6h def $f6d +; CHECK-NEXT: lgdr %r0, %f6 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r1) +; CHECK-NEXT: lgdr %r0, %f4 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r1) +; CHECK-NEXT: lgdr %r0, %f2 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r1) +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r1) +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun1_store_returned: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: brasl %r14, Fnptr@PLT +; VECTOR-NEXT: lgrl %r1, Dst@GOT +; VECTOR-NEXT: vreph %v0, %v24, 1 +; VECTOR-NEXT: vreph %v1, %v24, 2 +; VECTOR-NEXT: vreph %v2, %v24, 3 +; VECTOR-NEXT: vsteh %v24, 0(%r1), 0 +; VECTOR-NEXT: vsteh %v2, 6(%r1), 0 +; VECTOR-NEXT: vsteh %v1, 4(%r1), 0 +; VECTOR-NEXT: vsteh %v0, 2(%r1), 0 +; VECTOR-NEXT: lmg %r14, %r15, 272(%r15) +; VECTOR-NEXT: br %r14 + %C = call %Ty1 @Fnptr() + store %Ty1 %C, ptr @Dst + ret void +} + +%Ty2 = type <16 x half> +define void @fun2_arg(%Ty2 %A) { +; CHECK-LABEL: fun2_arg: +; CHECK: # %bb.0: +; CHECK-NEXT: aghi %r15, -64 +; CHECK-NEXT: .cfi_def_cfa_offset 224 +; CHECK-NEXT: std %f8, 56(%r15) # 8-byte Spill +; CHECK-NEXT: 
std %f9, 48(%r15) # 8-byte Spill +; CHECK-NEXT: std %f10, 40(%r15) # 8-byte Spill +; CHECK-NEXT: std %f11, 32(%r15) # 8-byte Spill +; CHECK-NEXT: std %f12, 24(%r15) # 8-byte Spill +; CHECK-NEXT: std %f13, 16(%r15) # 8-byte Spill +; CHECK-NEXT: std %f14, 8(%r15) # 8-byte Spill +; CHECK-NEXT: std %f15, 0(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: .cfi_offset %f10, -184 +; CHECK-NEXT: .cfi_offset %f11, -192 +; CHECK-NEXT: .cfi_offset %f12, -200 +; CHECK-NEXT: .cfi_offset %f13, -208 +; CHECK-NEXT: .cfi_offset %f14, -216 +; CHECK-NEXT: .cfi_offset %f15, -224 +; CHECK-NEXT: lgh %r0, 230(%r15) +; CHECK-NEXT: # kill: def $f6h killed $f6h def $f6d +; CHECK-NEXT: # kill: def $f4h killed $f4h def $f4d +; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgh %r1, 238(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f1, %r0 +; CHECK-NEXT: lgh %r0, 246(%r15) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f3, %r1 +; CHECK-NEXT: lgh %r1, 254(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f5, %r0 +; CHECK-NEXT: lgh %r0, 262(%r15) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f7, %r1 +; CHECK-NEXT: lgh %r1, 270(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f8, %r0 +; CHECK-NEXT: lgh %r0, 278(%r15) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f9, %r1 +; CHECK-NEXT: lgh %r1, 286(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f10, %r0 +; CHECK-NEXT: lgh %r0, 294(%r15) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f11, %r1 +; CHECK-NEXT: lgh %r1, 302(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f12, %r0 +; CHECK-NEXT: lgh %r0, 310(%r15) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: lgh %r2, 318(%r15) +; CHECK-NEXT: ldgr %f13, %r1 +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f14, %r0 +; CHECK-NEXT: sllg %r0, %r2, 48 +; CHECK-NEXT: lgrl %r1, Dst@GOT 
+; CHECK-NEXT: ldgr %f15, %r0 +; CHECK-NEXT: lgdr %r0, %f6 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r1) +; CHECK-NEXT: lgdr %r0, %f4 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r1) +; CHECK-NEXT: lgdr %r0, %f2 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r1) +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r1) +; CHECK-NEXT: lgdr %r0, %f15 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 30(%r1) +; CHECK-NEXT: lgdr %r0, %f14 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 28(%r1) +; CHECK-NEXT: lgdr %r0, %f13 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 26(%r1) +; CHECK-NEXT: lgdr %r0, %f12 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 24(%r1) +; CHECK-NEXT: lgdr %r0, %f11 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 22(%r1) +; CHECK-NEXT: lgdr %r0, %f10 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 20(%r1) +; CHECK-NEXT: lgdr %r0, %f9 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 18(%r1) +; CHECK-NEXT: lgdr %r0, %f8 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 16(%r1) +; CHECK-NEXT: lgdr %r0, %f7 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 14(%r1) +; CHECK-NEXT: lgdr %r0, %f5 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 12(%r1) +; CHECK-NEXT: lgdr %r0, %f3 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 10(%r1) +; CHECK-NEXT: lgdr %r0, %f1 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 8(%r1) +; CHECK-NEXT: ld %f8, 56(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 48(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f10, 40(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f11, 32(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f12, 24(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f13, 16(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f14, 8(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f15, 0(%r15) # 8-byte Reload +; CHECK-NEXT: aghi %r15, 64 +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: 
fun2_arg: +; VECTOR: # %bb.0: +; VECTOR-NEXT: lgrl %r1, Dst@GOT +; VECTOR-NEXT: vst %v26, 16(%r1), 4 +; VECTOR-NEXT: vst %v24, 0(%r1), 4 +; VECTOR-NEXT: br %r14 + store %Ty2 %A, ptr @Dst + ret void +} + +define void @fun2_call() { +; CHECK-LABEL: fun2_call: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -320 +; CHECK-NEXT: .cfi_def_cfa_offset 480 +; CHECK-NEXT: std %f8, 312(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 304(%r15) # 8-byte Spill +; CHECK-NEXT: std %f10, 296(%r15) # 8-byte Spill +; CHECK-NEXT: std %f11, 288(%r15) # 8-byte Spill +; CHECK-NEXT: std %f12, 280(%r15) # 8-byte Spill +; CHECK-NEXT: std %f13, 272(%r15) # 8-byte Spill +; CHECK-NEXT: std %f14, 264(%r15) # 8-byte Spill +; CHECK-NEXT: std %f15, 256(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: .cfi_offset %f10, -184 +; CHECK-NEXT: .cfi_offset %f11, -192 +; CHECK-NEXT: .cfi_offset %f12, -200 +; CHECK-NEXT: .cfi_offset %f13, -208 +; CHECK-NEXT: .cfi_offset %f14, -216 +; CHECK-NEXT: .cfi_offset %f15, -224 +; CHECK-NEXT: lgrl %r1, Src@GOT +; CHECK-NEXT: lgh %r0, 0(%r1) +; CHECK-NEXT: lgh %r2, 2(%r1) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: lgh %r0, 4(%r1) +; CHECK-NEXT: sllg %r2, %r2, 48 +; CHECK-NEXT: ldgr %f2, %r2 +; CHECK-NEXT: lgh %r2, 6(%r1) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f4, %r0 +; CHECK-NEXT: lgh %r0, 8(%r1) +; CHECK-NEXT: sllg %r2, %r2, 48 +; CHECK-NEXT: ldgr %f6, %r2 +; CHECK-NEXT: lgh %r2, 10(%r1) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f1, %r0 +; CHECK-NEXT: lgh %r0, 12(%r1) +; CHECK-NEXT: sllg %r2, %r2, 48 +; CHECK-NEXT: ldgr %f3, %r2 +; CHECK-NEXT: lgh %r2, 14(%r1) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f5, %r0 +; CHECK-NEXT: lgh %r0, 16(%r1) +; CHECK-NEXT: sllg %r2, %r2, 48 +; CHECK-NEXT: ldgr %f7, %r2 +; CHECK-NEXT: lgh %r2, 
18(%r1) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f8, %r0 +; CHECK-NEXT: lgh %r0, 20(%r1) +; CHECK-NEXT: sllg %r2, %r2, 48 +; CHECK-NEXT: ldgr %f9, %r2 +; CHECK-NEXT: lgh %r2, 22(%r1) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f10, %r0 +; CHECK-NEXT: lgh %r0, 24(%r1) +; CHECK-NEXT: sllg %r2, %r2, 48 +; CHECK-NEXT: ldgr %f11, %r2 +; CHECK-NEXT: lgh %r2, 26(%r1) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f12, %r0 +; CHECK-NEXT: lgh %r0, 28(%r1) +; CHECK-NEXT: sllg %r2, %r2, 48 +; CHECK-NEXT: lgh %r1, 30(%r1) +; CHECK-NEXT: ldgr %f13, %r2 +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f14, %r0 +; CHECK-NEXT: sllg %r0, %r1, 48 +; CHECK-NEXT: ldgr %f15, %r0 +; CHECK-NEXT: lgdr %r0, %f15 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 254(%r15) +; CHECK-NEXT: lgdr %r0, %f14 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 246(%r15) +; CHECK-NEXT: lgdr %r0, %f13 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 238(%r15) +; CHECK-NEXT: lgdr %r0, %f12 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 230(%r15) +; CHECK-NEXT: lgdr %r0, %f11 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 222(%r15) +; CHECK-NEXT: lgdr %r0, %f10 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 214(%r15) +; CHECK-NEXT: lgdr %r0, %f9 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 206(%r15) +; CHECK-NEXT: lgdr %r0, %f8 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 198(%r15) +; CHECK-NEXT: lgdr %r0, %f7 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 190(%r15) +; CHECK-NEXT: lgdr %r0, %f5 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 182(%r15) +; CHECK-NEXT: lgdr %r0, %f3 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 174(%r15) +; CHECK-NEXT: lgdr %r0, %f1 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 166(%r15) +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: # kill: def $f2h killed $f2h killed $f2d +; CHECK-NEXT: # kill: def 
$f4h killed $f4h killed $f4d +; CHECK-NEXT: # kill: def $f6h killed $f6h killed $f6d +; CHECK-NEXT: brasl %r14, Fnptr@PLT +; CHECK-NEXT: ld %f8, 312(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 304(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f10, 296(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f11, 288(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f12, 280(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f13, 272(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f14, 264(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f15, 256(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r14, %r15, 432(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun2_call: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: lgrl %r1, Src@GOT +; VECTOR-NEXT: vl %v26, 16(%r1), 4 +; VECTOR-NEXT: vl %v24, 0(%r1), 4 +; VECTOR-NEXT: brasl %r14, Fnptr@PLT +; VECTOR-NEXT: lmg %r14, %r15, 272(%r15) +; VECTOR-NEXT: br %r14 + %L = load %Ty2, ptr @Src + call void @Fnptr(%Ty2 %L) + ret void +} + +define %Ty2 @fun2_ret() { +; CHECK-LABEL: fun2_ret: +; CHECK: # %bb.0: +; CHECK-NEXT: lgrl %r1, Src@GOT +; CHECK-NEXT: lg %r0, 24(%r1) +; CHECK-NEXT: lg %r3, 16(%r1) +; CHECK-NEXT: lg %r4, 8(%r1) +; CHECK-NEXT: lg %r1, 0(%r1) +; CHECK-NEXT: stg %r0, 24(%r2) +; CHECK-NEXT: stg %r3, 16(%r2) +; CHECK-NEXT: stg %r4, 8(%r2) +; CHECK-NEXT: stg %r1, 0(%r2) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun2_ret: +; VECTOR: # %bb.0: +; VECTOR-NEXT: lgrl %r1, Src@GOT +; VECTOR-NEXT: vl %v24, 0(%r1), 4 +; VECTOR-NEXT: vl %v26, 16(%r1), 4 +; VECTOR-NEXT: br %r14 + %L = load %Ty2, ptr @Src + ret %Ty2 %L +} + +define void @fun2_store_returned() { +; CHECK-LABEL: fun2_store_returned: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; 
CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, Fnptr@PLT +; CHECK-NEXT: lg %r0, 184(%r15) +; CHECK-NEXT: lgrl %r1, Dst@GOT +; CHECK-NEXT: lg %r2, 176(%r15) +; CHECK-NEXT: lg %r3, 168(%r15) +; CHECK-NEXT: lg %r4, 160(%r15) +; CHECK-NEXT: stg %r0, 24(%r1) +; CHECK-NEXT: stg %r2, 16(%r1) +; CHECK-NEXT: stg %r3, 8(%r1) +; CHECK-NEXT: stg %r4, 0(%r1) +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun2_store_returned: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: brasl %r14, Fnptr@PLT +; VECTOR-NEXT: lgrl %r1, Dst@GOT +; VECTOR-NEXT: vst %v26, 16(%r1), 4 +; VECTOR-NEXT: vst %v24, 0(%r1), 4 +; VECTOR-NEXT: lmg %r14, %r15, 272(%r15) +; VECTOR-NEXT: br %r14 + %C = call %Ty2 @Fnptr() + store %Ty2 %C, ptr @Dst + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll new file mode 100644 index 0000000000000..825472299d028 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll @@ -0,0 +1,519 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR +; +; Test some fp16 vector operations, which must be scalarized. With less than +; 8 elements there should only be operations emitted for the used elements. 
+ +%Ty0 = type <8 x half> +define void @fun0(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun0: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -288 +; CHECK-NEXT: .cfi_def_cfa_offset 448 +; CHECK-NEXT: std %f8, 280(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 272(%r15) # 8-byte Spill +; CHECK-NEXT: std %f10, 264(%r15) # 8-byte Spill +; CHECK-NEXT: std %f11, 256(%r15) # 8-byte Spill +; CHECK-NEXT: std %f12, 248(%r15) # 8-byte Spill +; CHECK-NEXT: std %f13, 240(%r15) # 8-byte Spill +; CHECK-NEXT: std %f14, 232(%r15) # 8-byte Spill +; CHECK-NEXT: std %f15, 224(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: .cfi_offset %f10, -184 +; CHECK-NEXT: .cfi_offset %f11, -192 +; CHECK-NEXT: .cfi_offset %f12, -200 +; CHECK-NEXT: .cfi_offset %f13, -208 +; CHECK-NEXT: .cfi_offset %f14, -216 +; CHECK-NEXT: .cfi_offset %f15, -224 +; CHECK-NEXT: lgh %r0, 14(%r2) +; CHECK-NEXT: lgr %r13, %r3 +; CHECK-NEXT: lgh %r1, 12(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: stg %r0, 216(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 10(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: stg %r1, 208(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r1, 8(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: stg %r0, 200(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 6(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: stg %r1, 192(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r1, 4(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: stg %r0, 176(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 2(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: stg %r1, 160(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r1, 0(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f8, %r0 +; CHECK-NEXT: lgh %r0, 30(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f13, %r1 +; CHECK-NEXT: lgh %r1, 28(%r2) +; 
CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: stg %r0, 184(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 26(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: stg %r1, 168(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r1, 24(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: lgh %r3, 22(%r2) +; CHECK-NEXT: ldgr %f10, %r0 +; CHECK-NEXT: sllg %r0, %r1, 48 +; CHECK-NEXT: ldgr %f11, %r0 +; CHECK-NEXT: sllg %r0, %r3, 48 +; CHECK-NEXT: lgh %r1, 20(%r2) +; CHECK-NEXT: ldgr %f12, %r0 +; CHECK-NEXT: lgh %r0, 18(%r2) +; CHECK-NEXT: lgh %r2, 16(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f14, %r1 +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: sllg %r1, %r2, 48 +; CHECK-NEXT: ldgr %f0, %r1 +; CHECK-NEXT: ldgr %f15, %r0 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f13 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f9 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f13, %f0 +; CHECK-NEXT: ler %f0, %f15 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f9 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: ler %f0, %f14 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ld %f0, 160(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f9 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f12 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f12, %f0 +; CHECK-NEXT: ld %f0, 176(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f12 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; 
CHECK-NEXT: ler %f14, %f0 +; CHECK-NEXT: ler %f0, %f11 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f11, %f0 +; CHECK-NEXT: ld %f0, 192(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f11 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f11, %f0 +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ld %f0, 200(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f10 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ld %f0, 168(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f12, %f0 +; CHECK-NEXT: ld %f0, 208(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f12 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f12, %f0 +; CHECK-NEXT: ld %f0, 184(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f15, %f0 +; CHECK-NEXT: ld %f0, 216(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f15 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 14(%r13) +; CHECK-NEXT: lgdr %r0, %f12 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 12(%r13) +; CHECK-NEXT: lgdr %r0, %f10 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 10(%r13) +; CHECK-NEXT: lgdr %r0, %f11 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 
8(%r13) +; CHECK-NEXT: lgdr %r0, %f14 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r13) +; CHECK-NEXT: lgdr %r0, %f9 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r13) +; CHECK-NEXT: lgdr %r0, %f8 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r13) +; CHECK-NEXT: lgdr %r0, %f13 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r13) +; CHECK-NEXT: ld %f8, 280(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 272(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f10, 264(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f11, 256(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f12, 248(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f13, 240(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f14, 232(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f15, 224(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r13, %r15, 392(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun0: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -248 +; VECTOR-NEXT: .cfi_def_cfa_offset 408 +; VECTOR-NEXT: std %f8, 240(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: vl %v0, 16(%r2), 3 +; VECTOR-NEXT: mvc 160(16,%r15), 0(%r2) # 16-byte Folded Spill +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vreph %v0, %v0, 7 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 7 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; 
VECTOR-NEXT: vreph %v0, %v0, 6 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 6 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vmrhh %v0, %v0, %v1 +; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 5 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 5 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 4 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 4 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vmrhh %v0, %v0, %v1 +; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vmrhf %v0, %v0, %v1 +; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill +; 
VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 3 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 3 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 2 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 2 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vmrhh %v0, %v0, %v1 +; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vst %v0, 224(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 1 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: 
ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 1 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: vl %v1, 224(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vmrhh %v0, %v1, %v0 +; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vmrhf %v0, %v0, %v1 +; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: ld %f8, 240(%r15) # 8-byte Reload +; VECTOR-NEXT: vmrhg %v0, %v0, %v1 +; VECTOR-NEXT: vst %v0, 0(%r13), 3 +; VECTOR-NEXT: lmg %r13, %r15, 352(%r15) +; VECTOR-NEXT: br %r14 + %LHS = load %Ty0, ptr %Src + %S2 = getelementptr %Ty0, ptr %Src, i32 1 + %RHS = load %Ty0, ptr %S2 + %Res = fadd %Ty0 %LHS, %RHS + store %Ty0 %Res, ptr %Dst + ret void +} + +%Ty1 = type <4 x half> +define void @fun1(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun1: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -224 +; CHECK-NEXT: .cfi_def_cfa_offset 384 +; CHECK-NEXT: std %f8, 216(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 208(%r15) # 8-byte Spill +; CHECK-NEXT: std %f10, 200(%r15) # 8-byte Spill +; CHECK-NEXT: std %f11, 192(%r15) # 8-byte Spill +; CHECK-NEXT: std %f12, 184(%r15) # 8-byte Spill +; CHECK-NEXT: std %f13, 176(%r15) # 8-byte Spill +; CHECK-NEXT: std %f14, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f15, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: .cfi_offset %f10, -184 +; CHECK-NEXT: .cfi_offset %f11, -192 +; CHECK-NEXT: .cfi_offset %f12, -200 +; CHECK-NEXT: .cfi_offset %f13, -208 +; CHECK-NEXT: .cfi_offset %f14, -216 +; CHECK-NEXT: .cfi_offset %f15, -224 +; CHECK-NEXT: lgh %r0, 6(%r2) +; 
CHECK-NEXT: lgr %r13, %r3 +; CHECK-NEXT: lgh %r1, 4(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f8, %r0 +; CHECK-NEXT: lgh %r0, 2(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f9, %r1 +; CHECK-NEXT: lgh %r1, 0(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: lgh %r3, 14(%r2) +; CHECK-NEXT: ldgr %f12, %r0 +; CHECK-NEXT: sllg %r0, %r1, 48 +; CHECK-NEXT: ldgr %f10, %r0 +; CHECK-NEXT: sllg %r0, %r3, 48 +; CHECK-NEXT: lgh %r1, 12(%r2) +; CHECK-NEXT: ldgr %f11, %r0 +; CHECK-NEXT: lgh %r0, 10(%r2) +; CHECK-NEXT: lgh %r2, 8(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f13, %r1 +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: sllg %r1, %r2, 48 +; CHECK-NEXT: ldgr %f0, %r1 +; CHECK-NEXT: ldgr %f14, %r0 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f15, %f0 +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: sebr %f0, %f15 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ler %f0, %f14 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f14, %f0 +; CHECK-NEXT: ler %f0, %f12 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: sebr %f0, %f14 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f12, %f0 +; CHECK-NEXT: ler %f0, %f13 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f13, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: sebr %f0, %f13 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f11 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f11, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: sebr %f0, %f11 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth 
%r0, 6(%r13) +; CHECK-NEXT: lgdr %r0, %f9 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r13) +; CHECK-NEXT: lgdr %r0, %f12 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r13) +; CHECK-NEXT: lgdr %r0, %f10 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r13) +; CHECK-NEXT: ld %f8, 216(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 208(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f10, 200(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f11, 192(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f12, 184(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f13, 176(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f14, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f15, 160(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r13, %r15, 328(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -224 +; VECTOR-NEXT: .cfi_def_cfa_offset 384 +; VECTOR-NEXT: std %f8, 216(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f9, 208(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f10, 200(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f11, 192(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f12, 184(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f13, 176(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f14, 168(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f15, 160(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: .cfi_offset %f10, -184 +; VECTOR-NEXT: .cfi_offset %f11, -192 +; VECTOR-NEXT: .cfi_offset %f12, -200 +; VECTOR-NEXT: .cfi_offset %f13, -208 +; VECTOR-NEXT: .cfi_offset %f14, -216 +; VECTOR-NEXT: .cfi_offset %f15, -224 +; VECTOR-NEXT: vlreph %v0, 8(%r2) +; VECTOR-NEXT: vlreph %v8, 6(%r2) +; VECTOR-NEXT: vlreph %v9, 4(%r2) +; VECTOR-NEXT: vlreph %v10, 2(%r2) +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: vlreph %v11, 0(%r2) +; VECTOR-NEXT: vlreph %v12, 14(%r2) +; 
VECTOR-NEXT: vlreph %v13, 12(%r2) +; VECTOR-NEXT: vlreph %v14, 10(%r2) +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f15, %f0 +; VECTOR-NEXT: ldr %f0, %f11 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: sebr %f0, %f15 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f11, %f0 +; VECTOR-NEXT: ldr %f0, %f14 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f14, %f0 +; VECTOR-NEXT: ldr %f0, %f10 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: sebr %f0, %f14 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f10, %f0 +; VECTOR-NEXT: ldr %f0, %f13 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f13, %f0 +; VECTOR-NEXT: ldr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: sebr %f0, %f13 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f12 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f12, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: sebr %f0, %f12 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: vsteh %v9, 4(%r13), 0 +; VECTOR-NEXT: vsteh %v10, 2(%r13), 0 +; VECTOR-NEXT: vsteh %v11, 0(%r13), 0 +; VECTOR-NEXT: ld %f8, 216(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f9, 208(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f10, 200(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f11, 192(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f12, 184(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f13, 176(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f14, 168(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f15, 160(%r15) # 8-byte Reload +; VECTOR-NEXT: vsteh %v0, 6(%r13), 0 +; VECTOR-NEXT: lmg %r13, %r15, 328(%r15) +; VECTOR-NEXT: br %r14 + %LHS = load %Ty1, ptr %Src + %S2 = getelementptr %Ty1, ptr %Src, i32 1 + %RHS = load %Ty1, ptr %S2 + %Res = fsub %Ty1 %LHS, %RHS + store %Ty1 %Res, ptr %Dst + ret void +} diff --git 
a/llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll new file mode 100644 index 0000000000000..d19f393bfa11a --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR +; +; Test conversions between different-sized float elements. + +; Test cases where both elements of a v2f64 are converted to f16s. +define void @f1(<2 x double> %val, ptr %ptr) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: lgr %r13, %r2 +; CHECK-NEXT: ldr %f8, %f2 +; CHECK-NEXT: brasl %r14, __truncdfhf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ldr %f0, %f8 +; CHECK-NEXT: brasl %r14, __truncdfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r13) +; CHECK-NEXT: lgdr %r0, %f9 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r13) +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r13, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -184 +; VECTOR-NEXT: .cfi_def_cfa_offset 344 +; VECTOR-NEXT: 
std %f8, 176(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: lgr %r13, %r2 +; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vrepg %v0, %v24, 1 +; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: vsteh %v8, 2(%r13), 0 +; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Reload +; VECTOR-NEXT: vsteh %v0, 0(%r13), 0 +; VECTOR-NEXT: lmg %r13, %r15, 288(%r15) +; VECTOR-NEXT: br %r14 + %res = fptrunc <2 x double> %val to <2 x half> + store <2 x half> %res, ptr %ptr + ret void +} + +; Test conversion of an f64 in a vector register to an f16. +define half @f2(<2 x double> %vec) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __truncdfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f2: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: vlr %v0, %v24 +; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: lmg %r14, %r15, 272(%r15) +; VECTOR-NEXT: br %r14 + %scalar = extractelement <2 x double> %vec, i32 0 + %ret = fptrunc double %scalar to half + ret half %ret +} + +; Test cases where even elements of a v4f16 are converted to f64s. 
+define <2 x double> @f3(<4 x half> %vec) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: ler %f8, %f4 +; CHECK-NEXT: brasl %r14, __extendhfdf2@PLT +; CHECK-NEXT: ldr %f9, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfdf2@PLT +; CHECK-NEXT: ldr %f2, %f0 +; CHECK-NEXT: ldr %f0, %f9 +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r14, %r15, 288(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f3: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -192 +; VECTOR-NEXT: .cfi_def_cfa_offset 352 +; VECTOR-NEXT: vreph %v1, %v24, 2 +; VECTOR-NEXT: vlr %v0, %v24 +; VECTOR-NEXT: vst %v1, 176(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0 +; VECTOR-NEXT: vmrhg %v24, %v1, %v0 +; VECTOR-NEXT: lmg %r14, %r15, 304(%r15) +; VECTOR-NEXT: br %r14 + %shuffle = shufflevector <4 x half> %vec, <4 x half> %vec, <2 x i32> <i32 0, i32 2> + %res = fpext <2 x half> %shuffle to <2 x double> + ret <2 x double> %res +} + +; Test conversion of an f16 in a vector register to an f32.
+define float @f4(<4 x half> %vec) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f4: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: vlr %v0, %v24 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: lmg %r14, %r15, 272(%r15) +; VECTOR-NEXT: br %r14 + %scalar = extractelement <4 x half> %vec, i32 0 + %ret = fpext half %scalar to float + ret float %ret +} diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-select.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-select.ll new file mode 100644 index 0000000000000..0500f43b7f33e --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-select.ll @@ -0,0 +1,503 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR +; +; Test fcmp and select with fp16 vectors. + +; Use of vsel with full vector. 
+%Ty0 = type <8 x half> +define void @fun0(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun0: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r6, %r15, 48(%r15) +; CHECK-NEXT: .cfi_offset %r6, -112 +; CHECK-NEXT: .cfi_offset %r7, -104 +; CHECK-NEXT: .cfi_offset %r8, -96 +; CHECK-NEXT: .cfi_offset %r9, -88 +; CHECK-NEXT: .cfi_offset %r10, -80 +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r12, -64 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -272 +; CHECK-NEXT: .cfi_def_cfa_offset 432 +; CHECK-NEXT: std %f8, 264(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 256(%r15) # 8-byte Spill +; CHECK-NEXT: std %f10, 248(%r15) # 8-byte Spill +; CHECK-NEXT: std %f11, 240(%r15) # 8-byte Spill +; CHECK-NEXT: std %f12, 232(%r15) # 8-byte Spill +; CHECK-NEXT: std %f13, 224(%r15) # 8-byte Spill +; CHECK-NEXT: std %f14, 216(%r15) # 8-byte Spill +; CHECK-NEXT: std %f15, 208(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: .cfi_offset %f10, -184 +; CHECK-NEXT: .cfi_offset %f11, -192 +; CHECK-NEXT: .cfi_offset %f12, -200 +; CHECK-NEXT: .cfi_offset %f13, -208 +; CHECK-NEXT: .cfi_offset %f14, -216 +; CHECK-NEXT: .cfi_offset %f15, -224 +; CHECK-NEXT: lgh %r0, 14(%r2) +; CHECK-NEXT: stg %r0, 200(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 12(%r2) +; CHECK-NEXT: stg %r0, 160(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 6(%r2) +; CHECK-NEXT: sllg %r12, %r0, 48 +; CHECK-NEXT: lgh %r0, 4(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f10, %r0 +; CHECK-NEXT: lgh %r0, 2(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f9, %r0 +; CHECK-NEXT: lgh %r0, 0(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f12, %r0 +; CHECK-NEXT: lgh %r0, 30(%r2) +; CHECK-NEXT: stg %r0, 192(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 28(%r2) +; CHECK-NEXT: stg %r0, 184(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 22(%r2) +; 
CHECK-NEXT: sllg %r10, %r0, 48 +; CHECK-NEXT: lgh %r0, 20(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f13, %r0 +; CHECK-NEXT: lgh %r0, 18(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f8, %r0 +; CHECK-NEXT: lgh %r0, 16(%r2) +; CHECK-NEXT: lgh %r8, 10(%r2) +; CHECK-NEXT: lgh %r6, 8(%r2) +; CHECK-NEXT: lgh %r7, 26(%r2) +; CHECK-NEXT: lgh %r11, 24(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: lgr %r13, %r3 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f11, %f0 +; CHECK-NEXT: ler %f0, %f12 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f11 +; CHECK-NEXT: je .LBB0_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ler %f0, %f11 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: sllg %r6, %r6, 48 +; CHECK-NEXT: sllg %r9, %r11, 48 +; CHECK-NEXT: ldgr %f11, %r12 +; CHECK-NEXT: ldgr %f15, %r10 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: std %f0, 176(%r15) # 8-byte Spill +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f8 +; CHECK-NEXT: je .LBB0_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: sllg %r11, %r8, 48 +; CHECK-NEXT: sllg %r8, %r7, 48 +; CHECK-NEXT: ldgr %f12, %r6 +; CHECK-NEXT: ldgr %f14, %r9 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: std %f0, 168(%r15) # 8-byte Spill +; CHECK-NEXT: ler %f0, %f13 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f8 +; CHECK-NEXT: je .LBB0_6 +; CHECK-NEXT: # %bb.5: +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: .LBB0_6: +; CHECK-NEXT: lg %r0, 
160(%r15) # 8-byte Reload +; CHECK-NEXT: sllg %r12, %r0, 48 +; CHECK-NEXT: lg %r0, 184(%r15) # 8-byte Reload +; CHECK-NEXT: sllg %r10, %r0, 48 +; CHECK-NEXT: ldgr %f13, %r11 +; CHECK-NEXT: ldgr %f8, %r8 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: std %f0, 160(%r15) # 8-byte Spill +; CHECK-NEXT: ler %f0, %f15 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f11 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f9 +; CHECK-NEXT: je .LBB0_8 +; CHECK-NEXT: # %bb.7: +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: .LBB0_8: +; CHECK-NEXT: lg %r0, 200(%r15) # 8-byte Reload +; CHECK-NEXT: sllg %r11, %r0, 48 +; CHECK-NEXT: lg %r0, 192(%r15) # 8-byte Reload +; CHECK-NEXT: sllg %r9, %r0, 48 +; CHECK-NEXT: ldgr %f15, %r12 +; CHECK-NEXT: ldgr %f9, %r10 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f11, %f0 +; CHECK-NEXT: ler %f0, %f14 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ler %f0, %f12 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f10 +; CHECK-NEXT: je .LBB0_10 +; CHECK-NEXT: # %bb.9: +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: .LBB0_10: +; CHECK-NEXT: ldgr %f14, %r11 +; CHECK-NEXT: ldgr %f10, %r9 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f12, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: ler %f0, %f13 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f8 +; CHECK-NEXT: je .LBB0_12 +; CHECK-NEXT: # %bb.11: +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: .LBB0_12: +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f15 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f9 +; 
CHECK-NEXT: je .LBB0_14 +; CHECK-NEXT: # %bb.13: +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: .LBB0_14: +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ler %f0, %f14 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f10 +; CHECK-NEXT: je .LBB0_16 +; CHECK-NEXT: # %bb.15: +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: .LBB0_16: +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 14(%r13) +; CHECK-NEXT: lgdr %r0, %f9 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 12(%r13) +; CHECK-NEXT: lgdr %r0, %f8 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 10(%r13) +; CHECK-NEXT: lgdr %r0, %f12 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 8(%r13) +; CHECK-NEXT: lgdr %r0, %f11 +; CHECK-NEXT: ld %f8, 264(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 256(%r15) # 8-byte Reload +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: ld %f10, 248(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f11, 240(%r15) # 8-byte Reload +; CHECK-NEXT: sth %r0, 6(%r13) +; CHECK-NEXT: lg %r0, 160(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f12, 232(%r15) # 8-byte Reload +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: ld %f13, 224(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f14, 216(%r15) # 8-byte Reload +; CHECK-NEXT: sth %r0, 4(%r13) +; CHECK-NEXT: lg %r0, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f15, 208(%r15) # 8-byte Reload +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r13) +; CHECK-NEXT: lg %r0, 176(%r15) # 8-byte Reload +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r13) +; CHECK-NEXT: lmg %r6, %r15, 320(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun0: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r11, %r15, 88(%r15) +; VECTOR-NEXT: .cfi_offset %r11, -72 +; 
VECTOR-NEXT: .cfi_offset %r12, -64 +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -216 +; VECTOR-NEXT: .cfi_def_cfa_offset 376 +; VECTOR-NEXT: std %f8, 208(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: vl %v0, 16(%r2), 3 +; VECTOR-NEXT: mvc 176(16,%r15), 0(%r2) # 16-byte Folded Spill +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vreph %v0, %v0, 7 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 7 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: lhi %r11, 0 +; VECTOR-NEXT: lhi %r12, 0 +; VECTOR-NEXT: lochie %r11, -1 +; VECTOR-NEXT: vreph %v0, %v0, 3 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 3 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: lhi %r0, 0 +; VECTOR-NEXT: lochie %r0, -1 +; VECTOR-NEXT: vlvgp %v0, %r0, %r11 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: lhi 
%r0, 0 +; VECTOR-NEXT: lochie %r0, -1 +; VECTOR-NEXT: vlvgh %v0, %r0, 0 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 1 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 1 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: lhi %r0, 0 +; VECTOR-NEXT: lochie %r0, -1 +; VECTOR-NEXT: vlvgh %v0, %r0, 1 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 2 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 2 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: lhi %r0, 0 +; VECTOR-NEXT: lochie %r0, -1 +; VECTOR-NEXT: vlvgh %v0, %r0, 2 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 4 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 4 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: lhi %r0, 0 +; VECTOR-NEXT: lochie %r0, -1 +; VECTOR-NEXT: vlvgh %v0, %r0, 4 +; 
VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 5 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 5 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: lhi %r0, 0 +; VECTOR-NEXT: lochie %r0, -1 +; VECTOR-NEXT: vlvgh %v0, %r0, 5 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 6 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 6 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: vl %v2, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: lochie %r12, -1 +; VECTOR-NEXT: vlvgh %v2, %r12, 6 +; VECTOR-NEXT: ld %f8, 208(%r15) # 8-byte Reload +; VECTOR-NEXT: vsel %v0, %v0, %v1, %v2 +; VECTOR-NEXT: vst %v0, 0(%r13), 3 +; VECTOR-NEXT: lmg %r11, %r15, 304(%r15) +; VECTOR-NEXT: br %r14 + %A = load %Ty0, ptr %Src + %S2 = getelementptr %Ty0, ptr %Src, i32 1 + %B = load %Ty0, ptr %S2 + %C = fcmp oeq %Ty0 %A, %B + %S = select <8 x i1> %C, %Ty0 %A, %Ty0 %B + store %Ty0 %S, ptr %Dst + ret void +} + +%Ty1 = type <2 x half> +define void @fun1(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun1: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: 
.cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: std %f8, 184(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 176(%r15) # 8-byte Spill +; CHECK-NEXT: std %f10, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f11, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: .cfi_offset %f10, -184 +; CHECK-NEXT: .cfi_offset %f11, -192 +; CHECK-NEXT: lgh %r0, 2(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f8, %r0 +; CHECK-NEXT: lgh %r0, 0(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f11, %r0 +; CHECK-NEXT: lgh %r0, 6(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f10, %r0 +; CHECK-NEXT: lgh %r0, 4(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: lgr %r13, %r3 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f11 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f9 +; CHECK-NEXT: je .LBB1_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f10 +; CHECK-NEXT: je .LBB1_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r13) +; CHECK-NEXT: lgdr %r0, %f9 +; CHECK-NEXT: ld %f8, 184(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 176(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f10, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f11, 160(%r15) # 8-byte Reload +; CHECK-NEXT: srlg 
%r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r13) +; CHECK-NEXT: lmg %r13, %r15, 296(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -192 +; VECTOR-NEXT: .cfi_def_cfa_offset 352 +; VECTOR-NEXT: std %f8, 184(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f9, 176(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f10, 168(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f11, 160(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: .cfi_offset %f10, -184 +; VECTOR-NEXT: .cfi_offset %f11, -192 +; VECTOR-NEXT: vlreph %v0, 4(%r2) +; VECTOR-NEXT: vlreph %v8, 2(%r2) +; VECTOR-NEXT: vlreph %v11, 0(%r2) +; VECTOR-NEXT: vlreph %v9, 6(%r2) +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f10, %f0 +; VECTOR-NEXT: ldr %f0, %f11 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f10 +; VECTOR-NEXT: je .LBB1_2 +; VECTOR-NEXT: # %bb.1: +; VECTOR-NEXT: ldr %f0, %f10 +; VECTOR-NEXT: .LBB1_2: +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f10, %f0 +; VECTOR-NEXT: ldr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f9 +; VECTOR-NEXT: je .LBB1_4 +; VECTOR-NEXT: # %bb.3: +; VECTOR-NEXT: ldr %f0, %f9 +; VECTOR-NEXT: .LBB1_4: +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: vsteh %v10, 0(%r13), 0 +; VECTOR-NEXT: ld %f8, 184(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f9, 176(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f10, 168(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f11, 160(%r15) # 8-byte Reload +; VECTOR-NEXT: vsteh %v0, 2(%r13), 0 +; VECTOR-NEXT: lmg %r13, %r15, 296(%r15) +; VECTOR-NEXT: br %r14 + %A = load %Ty1, 
ptr %Src + %S2 = getelementptr %Ty1, ptr %Src, i32 1 + %B = load %Ty1, ptr %S2 + %C = fcmp oeq %Ty1 %A, %B + %S = select <2 x i1> %C, %Ty1 %A, %Ty1 %B + store %Ty1 %S, ptr %Dst + ret void +} + diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll new file mode 100644 index 0000000000000..b21c538c89ea9 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll @@ -0,0 +1,149 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR +; +; Test loading and storing of fp16 vectors. + +define void @fun0(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun0: +; CHECK: # %bb.0: +; CHECK-NEXT: lgh %r0, 0(%r2) +; CHECK-NEXT: lgh %r1, 2(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: lgh %r0, 4(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f1, %r1 +; CHECK-NEXT: lgh %r1, 6(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f2, %r0 +; CHECK-NEXT: lgh %r0, 8(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f3, %r1 +; CHECK-NEXT: lgh %r1, 10(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f4, %r0 +; CHECK-NEXT: lgh %r0, 12(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: lgh %r2, 14(%r2) +; CHECK-NEXT: ldgr %f5, %r1 +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f6, %r0 +; CHECK-NEXT: sllg %r0, %r2, 48 +; CHECK-NEXT: ldgr %f7, %r0 +; CHECK-NEXT: lgdr %r0, %f7 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 14(%r3) +; CHECK-NEXT: lgdr %r0, %f6 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 12(%r3) +; CHECK-NEXT: lgdr %r0, %f5 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 10(%r3) +; CHECK-NEXT: lgdr %r0, %f4 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 8(%r3) +; CHECK-NEXT: lgdr %r0, %f3 +; CHECK-NEXT: srlg %r0,
%r0, 48 +; CHECK-NEXT: sth %r0, 6(%r3) +; CHECK-NEXT: lgdr %r0, %f2 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r3) +; CHECK-NEXT: lgdr %r0, %f1 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r3) +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r3) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun0: +; VECTOR: # %bb.0: +; VECTOR-NEXT: vl %v0, 0(%r2), 3 +; VECTOR-NEXT: vst %v0, 0(%r3), 3 +; VECTOR-NEXT: br %r14 + %L = load <8 x half>, ptr %Src + store <8 x half> %L, ptr %Dst + ret void +} + +define void @fun1(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun1: +; CHECK: # %bb.0: +; CHECK-NEXT: lgh %r0, 4(%r2) +; CHECK-NEXT: lgh %r1, 6(%r2) +; CHECK-NEXT: l %r2, 0(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: sllg %r0, %r1, 48 +; CHECK-NEXT: ldgr %f1, %r0 +; CHECK-NEXT: st %r2, 0(%r3) +; CHECK-NEXT: lgdr %r0, %f1 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r3) +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r3) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: l %r0, 0(%r2) +; VECTOR-NEXT: vlreph %v0, 4(%r2) +; VECTOR-NEXT: vlreph %v1, 6(%r2) +; VECTOR-NEXT: vsteh %v1, 6(%r3), 0 +; VECTOR-NEXT: vsteh %v0, 4(%r3), 0 +; VECTOR-NEXT: st %r0, 0(%r3) +; VECTOR-NEXT: br %r14 + %L = load <4 x half>, ptr %Src + store <4 x half> %L, ptr %Dst + ret void +} + +define void @fun2(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun2: +; CHECK: # %bb.0: +; CHECK-NEXT: lg %r0, 0(%r2) +; CHECK-NEXT: lg %r1, 8(%r2) +; CHECK-NEXT: lg %r2, 16(%r2) +; CHECK-NEXT: stg %r0, 0(%r3) +; CHECK-NEXT: stg %r1, 8(%r3) +; CHECK-NEXT: stg %r2, 16(%r3) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun2: +; VECTOR: # %bb.0: +; VECTOR-NEXT: vl %v0, 0(%r2), 4 +; VECTOR-NEXT: vst %v0, 0(%r3), 4 +; VECTOR-NEXT: lg %r0, 16(%r2) +; VECTOR-NEXT: stg %r0, 16(%r3) +; VECTOR-NEXT: br %r14 + %L = load <12 x half>, ptr %Src + store <12 x 
half> %L, ptr %Dst + ret void +} + +define void @fun3(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun3: +; CHECK: # %bb.0: +; CHECK-NEXT: lg %r0, 24(%r2) +; CHECK-NEXT: lg %r1, 16(%r2) +; CHECK-NEXT: lg %r4, 8(%r2) +; CHECK-NEXT: lg %r2, 0(%r2) +; CHECK-NEXT: stg %r0, 24(%r3) +; CHECK-NEXT: stg %r1, 16(%r3) +; CHECK-NEXT: stg %r4, 8(%r3) +; CHECK-NEXT: stg %r2, 0(%r3) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun3: +; VECTOR: # %bb.0: +; VECTOR-NEXT: vl %v0, 16(%r2), 4 +; VECTOR-NEXT: vl %v1, 0(%r2), 4 +; VECTOR-NEXT: vst %v1, 0(%r3), 4 +; VECTOR-NEXT: vst %v0, 16(%r3), 4 +; VECTOR-NEXT: br %r14 + %L = load <16 x half>, ptr %Src + store <16 x half> %L, ptr %Dst + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-move.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-move.ll new file mode 100644 index 0000000000000..48d2f4b60c62f --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-move.ll @@ -0,0 +1,99 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR +; +; Test insertions into fp16 undef vectors. 
+ +define <8 x half> @f0(half %val) { +; CHECK-LABEL: f0: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r2) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f0: +; VECTOR: # %bb.0: +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vreph %v24, %v0, 0 +; VECTOR-NEXT: br %r14 + %ret = insertelement <8 x half> undef, half %val, i32 2 + ret <8 x half> %ret +} + +define <8 x half> @f1(half %val) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r2) +; CHECK-NEXT: sth %r0, 4(%r2) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vreph %v24, %v0, 0 +; VECTOR-NEXT: br %r14 + %v0 = insertelement <8 x half> undef, half %val, i32 2 + %ret = insertelement <8 x half> %v0, half %val, i32 3 + ret <8 x half> %ret +} + +define <8 x half> @f2(half %val0, half %val1) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f2 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r2) +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r2) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f2: +; VECTOR: # %bb.0: +; VECTOR-NEXT: # kill: def $f2h killed $f2h def $v2 +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vmrhh %v0, %v0, %v2 +; VECTOR-NEXT: vmrhf %v0, %v0, %v0 +; VECTOR-NEXT: vmrhg %v24, %v0, %v0 +; VECTOR-NEXT: br %r14 + %v0 = insertelement <8 x half> undef, half %val0, i32 2 + %ret = insertelement <8 x half> %v0, half %val1, i32 3 + ret <8 x half> %ret +} + +define <8 x half> @f3(half %val0, half %val1) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: 
def $f2h killed $f2h def $f2d +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f2 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 10(%r2) +; CHECK-NEXT: lgdr %r1, %f0 +; CHECK-NEXT: srlg %r1, %r1, 48 +; CHECK-NEXT: sth %r1, 8(%r2) +; CHECK-NEXT: sth %r0, 6(%r2) +; CHECK-NEXT: sth %r1, 4(%r2) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f3: +; VECTOR: # %bb.0: +; VECTOR-NEXT: # kill: def $f2h killed $f2h def $v2 +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vmrhh %v0, %v0, %v2 +; VECTOR-NEXT: vmrhf %v0, %v0, %v0 +; VECTOR-NEXT: vmrhg %v24, %v0, %v0 +; VECTOR-NEXT: br %r14 + %v0 = insertelement <8 x half> undef, half %val0, i32 2 + %v1 = insertelement <8 x half> %v0, half %val1, i32 3 + %v2 = insertelement <8 x half> %v1, half %val0, i32 4 + %ret = insertelement <8 x half> %v2, half %val1, i32 5 + ret <8 x half> %ret +} diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll deleted file mode 100644 index 4997c5b0c617d..0000000000000 --- a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll +++ /dev/null @@ -1,725 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \ -; RUN: | FileCheck %s --check-prefix=NOVEC -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ -; RUN: | FileCheck %s --check-prefix=VECTOR - -; Add the <8 x half> argument with itself and return it. 
-define <8 x half> @fun0(<8 x half> %Op) { -; NOVEC-LABEL: fun0: -; NOVEC: # %bb.0: # %entry -; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) -; NOVEC-NEXT: .cfi_offset %r13, -56 -; NOVEC-NEXT: .cfi_offset %r14, -48 -; NOVEC-NEXT: .cfi_offset %r15, -40 -; NOVEC-NEXT: aghi %r15, -224 -; NOVEC-NEXT: .cfi_def_cfa_offset 384 -; NOVEC-NEXT: std %f8, 216(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f9, 208(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f10, 200(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f11, 192(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f12, 184(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f13, 176(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f14, 168(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f15, 160(%r15) # 8-byte Spill -; NOVEC-NEXT: .cfi_offset %f8, -168 -; NOVEC-NEXT: .cfi_offset %f9, -176 -; NOVEC-NEXT: .cfi_offset %f10, -184 -; NOVEC-NEXT: .cfi_offset %f11, -192 -; NOVEC-NEXT: .cfi_offset %f12, -200 -; NOVEC-NEXT: .cfi_offset %f13, -208 -; NOVEC-NEXT: .cfi_offset %f14, -216 -; NOVEC-NEXT: .cfi_offset %f15, -224 -; NOVEC-NEXT: lgh %r0, 414(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f14, %r0 -; NOVEC-NEXT: lgh %r0, 406(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f12, %r0 -; NOVEC-NEXT: lgh %r0, 398(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f9, %r0 -; NOVEC-NEXT: lgh %r0, 390(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ler %f10, %f6 -; NOVEC-NEXT: ler %f11, %f4 -; NOVEC-NEXT: ler %f13, %f2 -; NOVEC-NEXT: ler %f15, %f0 -; NOVEC-NEXT: lgr %r13, %r2 -; NOVEC-NEXT: ldgr %f0, %r0 -; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f8, %f0 -; NOVEC-NEXT: ler %f0, %f9 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f9, %f0 -; NOVEC-NEXT: ler %f0, %f12 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; 
NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f12, %f0 -; NOVEC-NEXT: ler %f0, %f14 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f14, %f0 -; NOVEC-NEXT: ler %f0, %f15 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f15, %f0 -; NOVEC-NEXT: ler %f0, %f13 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f13, %f0 -; NOVEC-NEXT: ler %f0, %f11 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f11, %f0 -; NOVEC-NEXT: ler %f0, %f10 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d -; NOVEC-NEXT: lgdr %r0, %f0 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 6(%r13) -; NOVEC-NEXT: lgdr %r0, %f11 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 4(%r13) -; NOVEC-NEXT: lgdr %r0, %f13 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 2(%r13) -; NOVEC-NEXT: lgdr %r0, %f15 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 0(%r13) -; NOVEC-NEXT: lgdr %r0, %f14 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 14(%r13) -; NOVEC-NEXT: lgdr %r0, %f12 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 12(%r13) -; NOVEC-NEXT: lgdr %r0, %f9 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 10(%r13) -; NOVEC-NEXT: lgdr %r0, %f8 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 8(%r13) -; NOVEC-NEXT: ld %f8, 216(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f9, 208(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f10, 200(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f11, 192(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f12, 184(%r15) # 
8-byte Reload -; NOVEC-NEXT: ld %f13, 176(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f14, 168(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f15, 160(%r15) # 8-byte Reload -; NOVEC-NEXT: lmg %r13, %r15, 328(%r15) -; NOVEC-NEXT: br %r14 -; -; VECTOR-LABEL: fun0: -; VECTOR: # %bb.0: # %entry -; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) -; VECTOR-NEXT: .cfi_offset %r13, -56 -; VECTOR-NEXT: .cfi_offset %r14, -48 -; VECTOR-NEXT: .cfi_offset %r15, -40 -; VECTOR-NEXT: aghi %r15, -224 -; VECTOR-NEXT: .cfi_def_cfa_offset 384 -; VECTOR-NEXT: std %f8, 216(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f9, 208(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f10, 200(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f11, 192(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f12, 184(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f13, 176(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f14, 168(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f15, 160(%r15) # 8-byte Spill -; VECTOR-NEXT: .cfi_offset %f8, -168 -; VECTOR-NEXT: .cfi_offset %f9, -176 -; VECTOR-NEXT: .cfi_offset %f10, -184 -; VECTOR-NEXT: .cfi_offset %f11, -192 -; VECTOR-NEXT: .cfi_offset %f12, -200 -; VECTOR-NEXT: .cfi_offset %f13, -208 -; VECTOR-NEXT: .cfi_offset %f14, -216 -; VECTOR-NEXT: .cfi_offset %f15, -224 -; VECTOR-NEXT: vlreph %v11, 414(%r15) -; VECTOR-NEXT: vlreph %v12, 406(%r15) -; VECTOR-NEXT: vlreph %v13, 398(%r15) -; VECTOR-NEXT: vlreph %v14, 390(%r15) -; VECTOR-NEXT: ldr %f8, %f6 -; VECTOR-NEXT: ldr %f9, %f4 -; VECTOR-NEXT: ldr %f10, %f2 -; VECTOR-NEXT: lgr %r13, %r2 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f15, %f0 -; VECTOR-NEXT: ldr %f0, %f10 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f10, %f0 -; VECTOR-NEXT: ldr %f0, %f9 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: 
ldr %f9, %f0 -; VECTOR-NEXT: ldr %f0, %f8 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f8, %f0 -; VECTOR-NEXT: ldr %f0, %f14 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f14, %f0 -; VECTOR-NEXT: ldr %f0, %f13 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f13, %f0 -; VECTOR-NEXT: ldr %f0, %f12 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f12, %f0 -; VECTOR-NEXT: ldr %f0, %f11 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: vsteh %v0, 14(%r13), 0 -; VECTOR-NEXT: vsteh %v12, 12(%r13), 0 -; VECTOR-NEXT: vsteh %v13, 10(%r13), 0 -; VECTOR-NEXT: vsteh %v14, 8(%r13), 0 -; VECTOR-NEXT: vsteh %v8, 6(%r13), 0 -; VECTOR-NEXT: vsteh %v9, 4(%r13), 0 -; VECTOR-NEXT: vsteh %v10, 2(%r13), 0 -; VECTOR-NEXT: vsteh %v15, 0(%r13), 0 -; VECTOR-NEXT: ld %f8, 216(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f9, 208(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f10, 200(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f11, 192(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f12, 184(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f13, 176(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f14, 168(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f15, 160(%r15) # 8-byte Reload -; VECTOR-NEXT: lmg %r13, %r15, 328(%r15) -; VECTOR-NEXT: br %r14 -entry: - %Res = fadd <8 x half> %Op, %Op - ret <8 x half> %Res -} - -; Same, but with partial vector values. 
-define <4 x half> @fun1(<4 x half> %Op) { -; NOVEC-LABEL: fun1: -; NOVEC: # %bb.0: # %entry -; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) -; NOVEC-NEXT: .cfi_offset %r14, -48 -; NOVEC-NEXT: .cfi_offset %r15, -40 -; NOVEC-NEXT: aghi %r15, -192 -; NOVEC-NEXT: .cfi_def_cfa_offset 352 -; NOVEC-NEXT: std %f8, 184(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f9, 176(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f10, 168(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f11, 160(%r15) # 8-byte Spill -; NOVEC-NEXT: .cfi_offset %f8, -168 -; NOVEC-NEXT: .cfi_offset %f9, -176 -; NOVEC-NEXT: .cfi_offset %f10, -184 -; NOVEC-NEXT: .cfi_offset %f11, -192 -; NOVEC-NEXT: ler %f8, %f6 -; NOVEC-NEXT: ler %f9, %f4 -; NOVEC-NEXT: ler %f10, %f2 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f11, %f0 -; NOVEC-NEXT: ler %f0, %f10 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f10, %f0 -; NOVEC-NEXT: ler %f0, %f9 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f9, %f0 -; NOVEC-NEXT: ler %f0, %f8 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f6, %f0 -; NOVEC-NEXT: ler %f0, %f11 -; NOVEC-NEXT: ler %f2, %f10 -; NOVEC-NEXT: ler %f4, %f9 -; NOVEC-NEXT: ld %f8, 184(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f9, 176(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f10, 168(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f11, 160(%r15) # 8-byte Reload -; NOVEC-NEXT: lmg %r14, %r15, 304(%r15) -; NOVEC-NEXT: br %r14 -; -; VECTOR-LABEL: fun1: -; VECTOR: # %bb.0: # %entry -; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) -; VECTOR-NEXT: .cfi_offset %r14, -48 -; VECTOR-NEXT: .cfi_offset %r15, -40 -; VECTOR-NEXT: aghi %r15, -192 -; VECTOR-NEXT: .cfi_def_cfa_offset 352 -; VECTOR-NEXT: std 
%f8, 184(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f9, 176(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f10, 168(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f11, 160(%r15) # 8-byte Spill -; VECTOR-NEXT: .cfi_offset %f8, -168 -; VECTOR-NEXT: .cfi_offset %f9, -176 -; VECTOR-NEXT: .cfi_offset %f10, -184 -; VECTOR-NEXT: .cfi_offset %f11, -192 -; VECTOR-NEXT: ldr %f8, %f6 -; VECTOR-NEXT: ldr %f9, %f4 -; VECTOR-NEXT: ldr %f10, %f2 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f11, %f0 -; VECTOR-NEXT: ldr %f0, %f10 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f10, %f0 -; VECTOR-NEXT: ldr %f0, %f9 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f9, %f0 -; VECTOR-NEXT: ldr %f0, %f8 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f6, %f0 -; VECTOR-NEXT: ldr %f0, %f11 -; VECTOR-NEXT: ldr %f2, %f10 -; VECTOR-NEXT: ldr %f4, %f9 -; VECTOR-NEXT: ld %f8, 184(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f9, 176(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f10, 168(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f11, 160(%r15) # 8-byte Reload -; VECTOR-NEXT: lmg %r14, %r15, 304(%r15) -; VECTOR-NEXT: br %r14 -entry: - %Res = fadd <4 x half> %Op, %Op - ret <4 x half> %Res -} - -; Test a vector extension. 
-define <2 x half> @fun2(<2 x half> %Op) { -; NOVEC-LABEL: fun2: -; NOVEC: # %bb.0: # %entry -; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) -; NOVEC-NEXT: .cfi_offset %r14, -48 -; NOVEC-NEXT: .cfi_offset %r15, -40 -; NOVEC-NEXT: aghi %r15, -176 -; NOVEC-NEXT: .cfi_def_cfa_offset 336 -; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Spill -; NOVEC-NEXT: .cfi_offset %f8, -168 -; NOVEC-NEXT: .cfi_offset %f9, -176 -; NOVEC-NEXT: ler %f8, %f2 -; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT -; NOVEC-NEXT: ldr %f9, %f0 -; NOVEC-NEXT: ler %f0, %f8 -; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT -; NOVEC-NEXT: adbr %f9, %f9 -; NOVEC-NEXT: ldr %f8, %f0 -; NOVEC-NEXT: adbr %f8, %f0 -; NOVEC-NEXT: ldr %f0, %f9 -; NOVEC-NEXT: brasl %r14, __truncdfhf2@PLT -; NOVEC-NEXT: ler %f9, %f0 -; NOVEC-NEXT: ldr %f0, %f8 -; NOVEC-NEXT: brasl %r14, __truncdfhf2@PLT -; NOVEC-NEXT: ler %f2, %f0 -; NOVEC-NEXT: ler %f0, %f9 -; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Reload -; NOVEC-NEXT: lmg %r14, %r15, 288(%r15) -; NOVEC-NEXT: br %r14 -; -; VECTOR-LABEL: fun2: -; VECTOR: # %bb.0: # %entry -; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) -; VECTOR-NEXT: .cfi_offset %r14, -48 -; VECTOR-NEXT: .cfi_offset %r15, -40 -; VECTOR-NEXT: aghi %r15, -184 -; VECTOR-NEXT: .cfi_def_cfa_offset 344 -; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Spill -; VECTOR-NEXT: .cfi_offset %f8, -168 -; VECTOR-NEXT: ldr %f8, %f0 -; VECTOR-NEXT: ldr %f0, %f2 -; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT -; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0 -; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill -; VECTOR-NEXT: ldr %f0, %f8 -; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT -; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Reload -; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0 -; VECTOR-NEXT: vmrhg %v0, %v0, %v1 -; VECTOR-NEXT: vfadb %v0, %v0, %v0 -; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill -; VECTOR-NEXT: # kill: def $f0d killed $f0d 
killed $v0 -; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT -; VECTOR-NEXT: ldr %f8, %f0 -; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload -; VECTOR-NEXT: vrepg %v0, %v0, 1 -; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0 -; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT -; VECTOR-NEXT: ldr %f2, %f0 -; VECTOR-NEXT: ldr %f0, %f8 -; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Reload -; VECTOR-NEXT: lmg %r14, %r15, 296(%r15) -; VECTOR-NEXT: br %r14 -entry: - %E = fpext <2 x half> %Op to <2 x double> - %Add = fadd <2 x double> %E, %E - %Res = fptrunc <2 x double> %Add to <2 x half> - ret <2 x half> %Res -} - -; Load and store an <8 x half> vector. -define void @fun3(ptr %Src, ptr %Dst) { -; NOVEC-LABEL: fun3: -; NOVEC: # %bb.0: # %entry -; NOVEC-NEXT: lgh %r0, 0(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f0, %r0 -; NOVEC-NEXT: lgh %r0, 2(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f1, %r0 -; NOVEC-NEXT: lgh %r0, 4(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f2, %r0 -; NOVEC-NEXT: lgh %r0, 6(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f3, %r0 -; NOVEC-NEXT: lgh %r0, 8(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f4, %r0 -; NOVEC-NEXT: lgh %r0, 10(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f5, %r0 -; NOVEC-NEXT: lgh %r0, 12(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f6, %r0 -; NOVEC-NEXT: lgh %r0, 14(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f7, %r0 -; NOVEC-NEXT: lgdr %r0, %f7 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 14(%r3) -; NOVEC-NEXT: lgdr %r0, %f6 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 12(%r3) -; NOVEC-NEXT: lgdr %r0, %f5 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 10(%r3) -; NOVEC-NEXT: lgdr %r0, %f4 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 8(%r3) -; NOVEC-NEXT: lgdr %r0, %f3 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 6(%r3) -; NOVEC-NEXT: lgdr %r0, %f2 -; NOVEC-NEXT: 
srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 4(%r3) -; NOVEC-NEXT: lgdr %r0, %f1 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 2(%r3) -; NOVEC-NEXT: lgdr %r0, %f0 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 0(%r3) -; NOVEC-NEXT: br %r14 -; -; VECTOR-LABEL: fun3: -; VECTOR: # %bb.0: # %entry -; VECTOR-NEXT: vlreph %v0, 0(%r2) -; VECTOR-NEXT: vlreph %v1, 2(%r2) -; VECTOR-NEXT: vlreph %v2, 4(%r2) -; VECTOR-NEXT: vlreph %v3, 6(%r2) -; VECTOR-NEXT: vlreph %v4, 8(%r2) -; VECTOR-NEXT: vlreph %v5, 10(%r2) -; VECTOR-NEXT: vlreph %v6, 12(%r2) -; VECTOR-NEXT: vlreph %v7, 14(%r2) -; VECTOR-NEXT: vsteh %v7, 14(%r3), 0 -; VECTOR-NEXT: vsteh %v6, 12(%r3), 0 -; VECTOR-NEXT: vsteh %v5, 10(%r3), 0 -; VECTOR-NEXT: vsteh %v4, 8(%r3), 0 -; VECTOR-NEXT: vsteh %v3, 6(%r3), 0 -; VECTOR-NEXT: vsteh %v2, 4(%r3), 0 -; VECTOR-NEXT: vsteh %v1, 2(%r3), 0 -; VECTOR-NEXT: vsteh %v0, 0(%r3), 0 -; VECTOR-NEXT: br %r14 -entry: - %L = load <8 x half>, ptr %Src - store <8 x half> %L, ptr %Dst - ret void -} - -; Call a function with <8 x half> argument and return values. 
-declare <8 x half> @foo(<8 x half>) -define void @fun4(ptr %Src, ptr %Dst) { -; NOVEC-LABEL: fun4: -; NOVEC: # %bb.0: # %entry -; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) -; NOVEC-NEXT: .cfi_offset %r13, -56 -; NOVEC-NEXT: .cfi_offset %r14, -48 -; NOVEC-NEXT: .cfi_offset %r15, -40 -; NOVEC-NEXT: aghi %r15, -208 -; NOVEC-NEXT: .cfi_def_cfa_offset 368 -; NOVEC-NEXT: lgh %r0, 0(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f0, %r0 -; NOVEC-NEXT: lgh %r0, 2(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f2, %r0 -; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d -; NOVEC-NEXT: # kill: def $f2h killed $f2h killed $f2d -; NOVEC-NEXT: lgh %r0, 4(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f4, %r0 -; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d -; NOVEC-NEXT: lgh %r0, 6(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f6, %r0 -; NOVEC-NEXT: # kill: def $f6h killed $f6h killed $f6d -; NOVEC-NEXT: lgh %r0, 8(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f1, %r0 -; NOVEC-NEXT: lgh %r0, 10(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f3, %r0 -; NOVEC-NEXT: lgh %r0, 12(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f5, %r0 -; NOVEC-NEXT: lgh %r0, 14(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f7, %r0 -; NOVEC-NEXT: lgdr %r0, %f7 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 190(%r15) -; NOVEC-NEXT: lgdr %r0, %f5 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 182(%r15) -; NOVEC-NEXT: lgdr %r0, %f3 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 174(%r15) -; NOVEC-NEXT: lgdr %r0, %f1 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: la %r2, 192(%r15) -; NOVEC-NEXT: lgr %r13, %r3 -; NOVEC-NEXT: sth %r0, 166(%r15) -; NOVEC-NEXT: brasl %r14, foo@PLT -; NOVEC-NEXT: lgh %r0, 192(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f0, %r0 -; NOVEC-NEXT: lgh %r0, 194(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f1, %r0 -; 
NOVEC-NEXT: lgh %r0, 196(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f2, %r0 -; NOVEC-NEXT: lgh %r0, 198(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f3, %r0 -; NOVEC-NEXT: lgh %r0, 200(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f4, %r0 -; NOVEC-NEXT: lgh %r0, 202(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f5, %r0 -; NOVEC-NEXT: lgh %r0, 204(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f6, %r0 -; NOVEC-NEXT: lgh %r0, 206(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f7, %r0 -; NOVEC-NEXT: lgdr %r0, %f7 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 14(%r13) -; NOVEC-NEXT: lgdr %r0, %f6 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 12(%r13) -; NOVEC-NEXT: lgdr %r0, %f5 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 10(%r13) -; NOVEC-NEXT: lgdr %r0, %f4 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 8(%r13) -; NOVEC-NEXT: lgdr %r0, %f3 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 6(%r13) -; NOVEC-NEXT: lgdr %r0, %f2 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 4(%r13) -; NOVEC-NEXT: lgdr %r0, %f1 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 2(%r13) -; NOVEC-NEXT: lgdr %r0, %f0 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 0(%r13) -; NOVEC-NEXT: lmg %r13, %r15, 312(%r15) -; NOVEC-NEXT: br %r14 -; -; VECTOR-LABEL: fun4: -; VECTOR: # %bb.0: # %entry -; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) -; VECTOR-NEXT: .cfi_offset %r13, -56 -; VECTOR-NEXT: .cfi_offset %r14, -48 -; VECTOR-NEXT: .cfi_offset %r15, -40 -; VECTOR-NEXT: aghi %r15, -208 -; VECTOR-NEXT: .cfi_def_cfa_offset 368 -; VECTOR-NEXT: vlreph %v6, 6(%r2) -; VECTOR-NEXT: vlreph %v4, 4(%r2) -; VECTOR-NEXT: vlreph %v2, 2(%r2) -; VECTOR-NEXT: vlreph %v0, 0(%r2) -; VECTOR-NEXT: vlreph %v1, 8(%r2) -; VECTOR-NEXT: vlreph %v3, 10(%r2) -; VECTOR-NEXT: vlreph %v5, 12(%r2) -; VECTOR-NEXT: vlreph %v7, 14(%r2) -; VECTOR-NEXT: la %r2, 192(%r15) -; 
VECTOR-NEXT: lgr %r13, %r3 -; VECTOR-NEXT: vsteh %v7, 190(%r15), 0 -; VECTOR-NEXT: vsteh %v5, 182(%r15), 0 -; VECTOR-NEXT: vsteh %v3, 174(%r15), 0 -; VECTOR-NEXT: vsteh %v1, 166(%r15), 0 -; VECTOR-NEXT: brasl %r14, foo@PLT -; VECTOR-NEXT: vlreph %v0, 192(%r15) -; VECTOR-NEXT: vlreph %v1, 194(%r15) -; VECTOR-NEXT: vlreph %v2, 196(%r15) -; VECTOR-NEXT: vlreph %v3, 198(%r15) -; VECTOR-NEXT: vlreph %v4, 200(%r15) -; VECTOR-NEXT: vlreph %v5, 202(%r15) -; VECTOR-NEXT: vlreph %v6, 204(%r15) -; VECTOR-NEXT: vlreph %v7, 206(%r15) -; VECTOR-NEXT: vsteh %v7, 14(%r13), 0 -; VECTOR-NEXT: vsteh %v6, 12(%r13), 0 -; VECTOR-NEXT: vsteh %v5, 10(%r13), 0 -; VECTOR-NEXT: vsteh %v4, 8(%r13), 0 -; VECTOR-NEXT: vsteh %v3, 6(%r13), 0 -; VECTOR-NEXT: vsteh %v2, 4(%r13), 0 -; VECTOR-NEXT: vsteh %v1, 2(%r13), 0 -; VECTOR-NEXT: vsteh %v0, 0(%r13), 0 -; VECTOR-NEXT: lmg %r13, %r15, 312(%r15) -; VECTOR-NEXT: br %r14 -entry: - %arg = load <8 x half>, ptr %Src - %Res = call <8 x half> @foo(<8 x half> %arg) - store <8 x half> %Res, ptr %Dst - ret void -} - -; Receive and pass argument fully on stack. 
-declare void @foo2(<4 x half> %dummy, <8 x half> %Arg5) -define void @fun5(<4 x half> %dummy, <8 x half> %Arg5) { -; NOVEC-LABEL: fun5: -; NOVEC: # %bb.0: -; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) -; NOVEC-NEXT: .cfi_offset %r14, -48 -; NOVEC-NEXT: .cfi_offset %r15, -40 -; NOVEC-NEXT: aghi %r15, -256 -; NOVEC-NEXT: .cfi_def_cfa_offset 416 -; NOVEC-NEXT: std %f8, 248(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f9, 240(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f10, 232(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f11, 224(%r15) # 8-byte Spill -; NOVEC-NEXT: .cfi_offset %f8, -168 -; NOVEC-NEXT: .cfi_offset %f9, -176 -; NOVEC-NEXT: .cfi_offset %f10, -184 -; NOVEC-NEXT: .cfi_offset %f11, -192 -; NOVEC-NEXT: lgh %r0, 422(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f1, %r0 -; NOVEC-NEXT: lgh %r0, 430(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f3, %r0 -; NOVEC-NEXT: lgh %r0, 438(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f5, %r0 -; NOVEC-NEXT: lgh %r0, 446(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f7, %r0 -; NOVEC-NEXT: lgh %r0, 454(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f8, %r0 -; NOVEC-NEXT: lgh %r0, 462(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f9, %r0 -; NOVEC-NEXT: lgh %r0, 470(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f10, %r0 -; NOVEC-NEXT: lgh %r0, 478(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f11, %r0 -; NOVEC-NEXT: lgdr %r0, %f11 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 222(%r15) -; NOVEC-NEXT: lgdr %r0, %f10 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 214(%r15) -; NOVEC-NEXT: lgdr %r0, %f9 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 206(%r15) -; NOVEC-NEXT: lgdr %r0, %f8 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 198(%r15) -; NOVEC-NEXT: lgdr %r0, %f7 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 190(%r15) -; NOVEC-NEXT: lgdr %r0, %f5 -; NOVEC-NEXT: srlg %r0, 
%r0, 48 -; NOVEC-NEXT: sth %r0, 182(%r15) -; NOVEC-NEXT: lgdr %r0, %f3 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 174(%r15) -; NOVEC-NEXT: lgdr %r0, %f1 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 166(%r15) -; NOVEC-NEXT: brasl %r14, foo2@PLT -; NOVEC-NEXT: ld %f8, 248(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f9, 240(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f10, 232(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f11, 224(%r15) # 8-byte Reload -; NOVEC-NEXT: lmg %r14, %r15, 368(%r15) -; NOVEC-NEXT: br %r14 -; -; VECTOR-LABEL: fun5: -; VECTOR: # %bb.0: -; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) -; VECTOR-NEXT: .cfi_offset %r14, -48 -; VECTOR-NEXT: .cfi_offset %r15, -40 -; VECTOR-NEXT: aghi %r15, -224 -; VECTOR-NEXT: .cfi_def_cfa_offset 384 -; VECTOR-NEXT: vlreph %v1, 390(%r15) -; VECTOR-NEXT: vlreph %v3, 398(%r15) -; VECTOR-NEXT: vlreph %v5, 406(%r15) -; VECTOR-NEXT: vlreph %v7, 414(%r15) -; VECTOR-NEXT: vlreph %v16, 422(%r15) -; VECTOR-NEXT: vlreph %v17, 430(%r15) -; VECTOR-NEXT: vlreph %v18, 438(%r15) -; VECTOR-NEXT: vlreph %v19, 446(%r15) -; VECTOR-NEXT: vsteh %v19, 222(%r15), 0 -; VECTOR-NEXT: vsteh %v18, 214(%r15), 0 -; VECTOR-NEXT: vsteh %v17, 206(%r15), 0 -; VECTOR-NEXT: vsteh %v16, 198(%r15), 0 -; VECTOR-NEXT: vsteh %v7, 190(%r15), 0 -; VECTOR-NEXT: vsteh %v5, 182(%r15), 0 -; VECTOR-NEXT: vsteh %v3, 174(%r15), 0 -; VECTOR-NEXT: vsteh %v1, 166(%r15), 0 -; VECTOR-NEXT: brasl %r14, foo2@PLT -; VECTOR-NEXT: lmg %r14, %r15, 336(%r15) -; VECTOR-NEXT: br %r14 - call void @foo2(<4 x half> %dummy, <8 x half> %Arg5) - ret void -}