diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 2a36f3dea34ce..d6298c4ebf24a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -149,6 +149,12 @@ def FeatureFmaMixInsts : SubtargetFeature<"fma-mix-insts",
   "Has v_fma_mix_f32, v_fma_mixlo_f16, v_fma_mixhi_f16 instructions"
 >;

+def FeatureFmaMixBF16Insts : SubtargetFeature<"fma-mix-bf16-insts",
+  "HasFmaMixBF16Insts",
+  "true",
+  "Has v_fma_mix_f32_bf16, v_fma_mixlo_bf16, v_fma_mixhi_bf16 instructions"
+>;
+
 def FeatureIEEEMinimumMaximumInsts : SubtargetFeature<"ieee-minimum-maximum-insts",
   "HasIEEEMinimumMaximumInsts",
   "true",
@@ -2007,6 +2013,7 @@ def FeatureISAVersion12_50 : FeatureSet<
    FeatureBF16ConversionInsts,
    FeatureBF16PackedInsts,
    FeatureCvtPkF16F32Inst,
+   FeatureFmaMixBF16Insts,
    FeatureMin3Max3PKF16,
    FeatureMinimum3Maximum3PKF16,
    FeaturePrngInst,
@@ -2599,6 +2606,9 @@ def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
 def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
   AssemblerPredicate<(all_of FeatureFmaMixInsts)>;

+def HasFmaMixBF16Insts : Predicate<"Subtarget->hasFmaMixBF16Insts()">,
+  AssemblerPredicate<(all_of FeatureFmaMixBF16Insts)>;
+
 def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
   AssemblerPredicate<(all_of FeatureDLInsts)>;

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 5a2416debb417..0ca2286c11c94 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3861,58 +3861,126 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
   return SelectVOP3Mods(In, Src, SrcMods);
 }

+// Match lowered fpext from bf16 to f32. This is a bit operation extending
+// a 16-bit value with 16 bits of zeroes at LSB:
+//
+// 1. (f32 (bitcast (build_vector (i16 0), (i16 (bitcast bf16:val)))))
+// 2. (f32 (bitcast (and i32:val, 0xffff0000))) -> IsExtractHigh = true
+// 3. (f32 (bitcast (shl i32:val, 16))) -> IsExtractHigh = false
+//
+// Returns the bf16 value (form 1) or the i32 value (forms 2 and 3) being
+// extended, or a null SDValue if \p Op matches none of the forms.
+// \p IsExtractHigh is false for form 1 and whenever nothing matches.
+static SDValue matchBF16FPExtendLike(SDValue Op, bool &IsExtractHigh) {
+  IsExtractHigh = false;
+  if (Op.getValueType() != MVT::f32 || Op.getOpcode() != ISD::BITCAST)
+    return SDValue();
+  Op = Op.getOperand(0);
+
+  if (Op.getValueType() == MVT::v2i16 && Op.getOpcode() == ISD::BUILD_VECTOR) {
+    const auto *Low16 = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+    if (!Low16 || !Low16->isZero())
+      return SDValue();
+    Op = stripBitcast(Op.getOperand(1));
+    if (Op.getValueType() != MVT::bf16)
+      return SDValue();
+    return Op;
+  }
+
+  if (Op.getValueType() != MVT::i32)
+    return SDValue();
+
+  if (Op.getOpcode() == ISD::AND) {
+    if (const auto *Mask = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      if (Mask->getZExtValue() == 0xffff0000) {
+        IsExtractHigh = true;
+        return Op.getOperand(0);
+      }
+    }
+    return SDValue();
+  }
+
+  if (Op.getOpcode() == ISD::SHL) {
+    if (const auto *Amt = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      if (Amt->getZExtValue() == 16)
+        return Op.getOperand(0);
+    }
+  }
+
+  return SDValue();
+}
+
 // The return value is not whether the match is possible (which it always is),
-// but whether or not it a conversion is really used.
+// but whether or not a conversion is really used.
+//
+// \p VT is the 16-bit source type to look for (f16 or bf16). \p Src is only
+// replaced by the narrow source once a conversion is known to be used, so
+// callers that ignore the result still get the operand produced by
+// SelectVOP3ModsImpl when this returns false.
 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
-                                                   unsigned &Mods) const {
+                                                   unsigned &Mods,
+                                                   MVT VT) const {
   Mods = 0;
   SelectVOP3ModsImpl(In, Src, Mods);

+  // Work on a local copy: the failure paths below must not clobber Src.
+  SDValue Cvt;
+  bool IsExtractHigh = false;
   if (Src.getOpcode() == ISD::FP_EXTEND) {
-    Src = Src.getOperand(0);
-    assert(Src.getValueType() == MVT::f16);
-    Src = stripBitcast(Src);
+    Cvt = Src.getOperand(0);
+  } else if (VT == MVT::bf16) {
+    Cvt = matchBF16FPExtendLike(Src, IsExtractHigh);
+    if (!Cvt)
+      return false;
+  } else {
+    return false;
+  }

-    // Be careful about folding modifiers if we already have an abs. fneg is
-    // applied last, so we don't want to apply an earlier fneg.
-    if ((Mods & SISrcMods::ABS) == 0) {
-      unsigned ModsTmp;
-      SelectVOP3ModsImpl(Src, Src, ModsTmp);
+  // An fpext from a different 16-bit type than VT cannot be folded. i32 is
+  // accepted for bf16 because the and/shl forms yield the full 32-bit value.
+  if (Cvt.getValueType() != VT &&
+      (VT != MVT::bf16 || Cvt.getValueType() != MVT::i32))
+    return false;

-      if ((ModsTmp & SISrcMods::NEG) != 0)
-        Mods ^= SISrcMods::NEG;
+  Src = stripBitcast(Cvt);

-      if ((ModsTmp & SISrcMods::ABS) != 0)
-        Mods |= SISrcMods::ABS;
-    }
+  // Be careful about folding modifiers if we already have an abs. fneg is
+  // applied last, so we don't want to apply an earlier fneg.
+  if ((Mods & SISrcMods::ABS) == 0) {
+    unsigned ModsTmp;
+    SelectVOP3ModsImpl(Src, Src, ModsTmp);
+
+    if ((ModsTmp & SISrcMods::NEG) != 0)
+      Mods ^= SISrcMods::NEG;

-    // op_sel/op_sel_hi decide the source type and source.
-    // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
-    // If the sources's op_sel is set, it picks the high half of the source
-    // register.
+    if ((ModsTmp & SISrcMods::ABS) != 0)
+      Mods |= SISrcMods::ABS;
+  }

-    Mods |= SISrcMods::OP_SEL_1;
-    if (isExtractHiElt(Src, Src)) {
-      Mods |= SISrcMods::OP_SEL_0;
+  // op_sel/op_sel_hi decide the source type and source.
+  // If the source's op_sel_hi is set, it indicates to do a conversion from
+  // the 16-bit type (fp16 or bf16). If the source's op_sel is set, it picks
+  // the high half of the source register.

-      // TODO: Should we try to look for neg/abs here?
-    }
+  Mods |= SISrcMods::OP_SEL_1;
+  if (IsExtractHigh ||
+      (Src.getValueSizeInBits() == 16 && isExtractHiElt(Src, Src))) {
+    Mods |= SISrcMods::OP_SEL_0;

-    // Prevent unnecessary subreg COPY to VGPR_16
-    if (Src.getOpcode() == ISD::TRUNCATE &&
-        Src.getOperand(0).getValueType() == MVT::i32) {
-      Src = Src.getOperand(0);
-    }
-    return true;
+    // TODO: Should we try to look for neg/abs here?
   }

-  return false;
+  // Prevent unnecessary subreg COPY to VGPR_16
+  if (Src.getOpcode() == ISD::TRUNCATE &&
+      Src.getOperand(0).getValueType() == MVT::i32) {
+    Src = Src.getOperand(0);
+  }
+  return true;
 }

 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
                                                   SDValue &SrcMods) const {
   unsigned Mods = 0;
-  if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
+  if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16))
     return false;
   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
   return true;
@@ -3921,7 +3989,24 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
                                                SDValue &SrcMods) const {
   unsigned Mods = 0;
-  SelectVOP3PMadMixModsImpl(In, Src, Mods);
+  SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::f16);
+  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src,
+                                                      SDValue &SrcMods) const {
+  unsigned Mods = 0;
+  if (!SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16))
+    return false;
+  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src,
+                                                   SDValue &SrcMods) const {
+  unsigned Mods = 0;
+  SelectVOP3PMadMixModsImpl(In, Src, Mods, MVT::bf16);
   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 6123d75d7b616..7ecba1e24ff51 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -254,11 +254,15 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;

   bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
-  bool SelectVOP3PMadMixModsImpl(SDValue In,
SDValue &Src, - unsigned &Mods) const; + bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods, + MVT VT) const; bool SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectVOP3PMadMixBF16ModsExt(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectVOP3PMadMixBF16Mods(SDValue In, SDValue &Src, + SDValue &SrcMods) const; bool SelectBITOP3(SDValue In, SDValue &Src0, SDValue &Src1, SDValue &Src2, SDValue &Tbl) const; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 0435e7f9e51d2..0683f02955594 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -123,6 +123,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasSMemRealTime = false; bool HasIntClamp = false; bool HasFmaMixInsts = false; + bool HasFmaMixBF16Insts = false; bool HasMovrel = false; bool HasVGPRIndexMode = false; bool HasScalarDwordx3Loads = false; @@ -462,6 +463,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return HasFmaMixInsts; } + bool hasFmaMixBF16Insts() const { return HasFmaMixBF16Insts; } + bool hasCARRY() const { return true; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index f1a8ee118356e..6d963b77850f4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1061,10 +1061,12 @@ ArrayRef SITargetLowering::getRoundingControlRegisters() const { // where this is OK to use. 
bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const { - return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || - (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && - DestVT.getScalarType() == MVT::f32 && - SrcVT.getScalarType() == MVT::f16 && + return DestVT.getScalarType() == MVT::f32 && + ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && + SrcVT.getScalarType() == MVT::f16) || + (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() && + SrcVT.getScalarType() == MVT::bf16)) && // TODO: This probably only requires no input flushing? denormalModeIsFlushAllF32(DAG.getMachineFunction()); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 485ca78db93a7..b0be3f864b94d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1662,6 +1662,8 @@ def VOP3OpSelMods : ComplexPattern; def VOP3PMadMixModsExt : ComplexPattern; def VOP3PMadMixMods : ComplexPattern; +def VOP3PMadMixBF16ModsExt : ComplexPattern; +def VOP3PMadMixBF16Mods : ComplexPattern; def VINTERPMods : ComplexPattern; def VINTERPModsHi : ComplexPattern; @@ -2866,6 +2868,7 @@ def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], /*EnableClamp= def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>; def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>; +def VOP_BF16_BF16_BF16_BF16 : VOPProfile <[bf16, bf16, bf16, bf16, untyped]>; def VOP_I32_I16_I16_I32 : VOPProfile <[i32, i16, i16, i32, untyped]>; def VOP_I32_I16 : VOPProfile <[i32, i16, untyped, untyped]>; @@ -2917,6 +2920,7 @@ def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp= def VOP_I64_I64_I64_ARITH : VOPProfile <[i64, i64, i64, untyped], /*EnableClamp=*/1>; def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>; def VOP_F32_F16_F16_F16 : VOPProfile 
<[f32, f16, f16, f16]>; +def VOP_F32_BF16_BF16_BF16 : VOPProfile <[f32, bf16, bf16, bf16]>; def VOP_V2BF16_F32_F32 : VOPProfile <[v2bf16, f32, f32, untyped]>; def VOP_V32F32_V6I32_F32 : VOPProfile <[v32f32, v6i32, f32, untyped]>; def VOP_V32F16_V6I32_F32 : VOPProfile <[v32f16, v6i32, f32, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index ea14c77cdff0b..7017da9dc3521 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -35,14 +35,18 @@ class VOP3P_Mix_Profile : VOP3P_Profile { bit UseTiedOutput = useTiedOutput; + defvar Src0RC = getVCSrcForVT.ret; + defvar Src1RC = getVCSrcForVT.ret; + defvar Src2RC = getVCSrcForVT.ret; + dag srcs = - (ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0, - FP16InputMods:$src1_modifiers, VCSrc_f16:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + (ins FP16InputMods:$src0_modifiers, Src0RC:$src0, + FP16InputMods:$src1_modifiers, Src1RC:$src1, + FP16InputMods:$src2_modifiers, Src2RC:$src2); dag dpp_srcs = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1, - FP16InputMods:$src2_modifiers, VCSrc_f16:$src2); + FP16InputMods:$src2_modifiers, Src2RC:$src2); // FIXME: Clamp0 misbehaves with the non-default vdst_in // following it. For now workaround this by requiring clamp @@ -161,38 +165,42 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile { + Instruction mixhi_inst, + ValueType VT = f16, + ValueType vecVT = v2f16> { + defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods); + defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt); // At least one of the operands needs to be an fpextend of an f16 // for this to be worthwhile, so we need three patterns here. // TODO: Could we use a predicate to inspect src1/2/3 instead? 
def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsExtPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsExtPat VT:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsPat f32:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < - (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)), - (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))), + (f32 (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_mods)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_mods)), + (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))), (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE)>; def : GCNPat < (AMDGPUclamp (build_vector - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))), - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))), - (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0, + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, 
i32:$lo_src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))), + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))), + (vecVT (mixhi_inst $hi_src0_modifiers, $hi_src0, $hi_src1_modifiers, $hi_src1, $hi_src2_modifiers, $hi_src2, DSTCLAMP.ENABLE, @@ -204,8 +212,8 @@ multiclass MadFmaMixPats; def : GCNPat < - (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))), + (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))), (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, $src1, (i32 0), (i32 0), @@ -214,9 +222,9 @@ multiclass MadFmaMixPats; def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + (build_vector VT:$elt0, (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers)))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, (i32 0), (i32 0), DSTCLAMP.NONE, @@ -224,9 +232,9 @@ multiclass MadFmaMixPats; def : GCNPat < - (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), + (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), (mixlo_inst $src0_modifiers, $src0, $src1_modifiers, 
$src1, $src2_modifiers, $src2, @@ -241,10 +249,10 @@ multiclass MadFmaMixPats; def : GCNPat < - (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.NONE, - (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16))) + (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16))) >; def : GCNPat < (build_vector - f16:$elt0, - (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)), - (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)), - (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))), - (v2f16 (mixhi_inst $src0_modifiers, $src0, + VT:$elt0, + (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)), + (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))), + (vecVT (mixhi_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, DSTCLAMP.ENABLE, - (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16))) + (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16))) >; } // end True16Predicate } @@ -360,6 +368,24 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile; } +let SubtargetPredicate = HasFmaMixBF16Insts in { +let isCommutable = 1 in { + +let isReMaterializable = 1 in +defm V_FMA_MIX_F32_BF16 : VOP3_VOP3PInst<"v_fma_mix_f32_bf16", VOP3P_Mix_Profile>; + +let FPDPRounding = 1 in { +defm V_FMA_MIXLO_BF16 
: VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile>; + +let ClampLo = 0, ClampHi = 1 in { +defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile>; +} +} // End FPDPRounding = 1 +} // End isCommutable = 1 + +defm : MadFmaMixPats; +} // End SubtargetPredicate = HasFmaMixBF16Insts + def PK_ADD_MINMAX_Profile : VOP3P_Profile { let HasModifiers = 0; } @@ -2247,6 +2273,10 @@ defm V_PK_MAXIMUM3_F16 : VOP3P_Real_gfx1250<0x37>; defm V_PK_MIN3_NUM_F16 : VOP3P_Real_gfx1250<0x38>; defm V_PK_MAX3_NUM_F16 : VOP3P_Real_gfx1250<0x39>; +defm V_FMA_MIX_F32_BF16 : VOP3P_Realtriple; +defm V_FMA_MIXLO_BF16 : VOP3P_Realtriple; +defm V_FMA_MIXHI_BF16 : VOP3P_Realtriple; + defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>; defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>; diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll new file mode 100644 index 0000000000000..11cda2d4171ed --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll @@ -0,0 +1,634 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s + +define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) + ret float %result +} + +define float @v_mad_mix_f32_bf16hi_bf16hi_bf16hi_int(i32 %src0, i32 %src1, i32 %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_bf16hi_bf16hi_bf16hi_int: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.hi = lshr i32 %src0, 16 + %src1.hi = lshr i32 %src1, 16 + %src2.hi = lshr i32 %src2, 16 + %src0.i16 = trunc i32 %src0.hi to i16 + %src1.i16 = trunc i32 %src1.hi to i16 + %src2.i16 = trunc i32 %src2.hi to i16 + %src0.fp16 = bitcast i16 %src0.i16 to bfloat + %src1.fp16 = bitcast i16 %src1.i16 to bfloat + %src2.fp16 = bitcast i16 %src2.i16 to bfloat + %src0.ext = fpext bfloat %src0.fp16 to float + %src1.ext = fpext bfloat %src1.fp16 to float + %src2.ext = fpext bfloat %src2.fp16 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) + ret float %result +} + +define float @v_mad_mix_f32_bf16hi_bf16hi_bf16hi_elt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_bf16hi_bf16hi_bf16hi_elt: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.hi = extractelement <2 x bfloat> %src0, i32 1 + %src1.hi = extractelement <2 x bfloat> %src1, i32 1 + %src2.hi = extractelement <2 x bfloat> %src2, i32 1 + %src0.ext = fpext bfloat %src0.hi to float + %src1.ext = fpext bfloat %src1.hi to float + %src2.ext = fpext bfloat %src2.hi to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) + ret float %result +} + +define <2 x float> @v_mad_mix_v2f32(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_v2f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1 +; GFX1250-NEXT: 
v_and_b32_e32 v7, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> + %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> + %src2.ext = fpext <2 x bfloat> %src2 to <2 x float> + %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext) + ret <2 x float> %result +} + +define <2 x float> @v_mad_mix_v2f32_shuffle(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_v2f32_shuffle: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v5, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff0000, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.shuf = shufflevector <2 x bfloat> %src0, <2 x bfloat> undef, <2 x i32> + %src1.shuf = shufflevector <2 x bfloat> %src1, <2 x bfloat> undef, <2 x i32> + %src2.shuf = shufflevector <2 x bfloat> %src2, <2 x bfloat> undef, <2 x i32> + %src0.ext = fpext <2 x bfloat> %src0.shuf to <2 x float> + %src1.ext = fpext <2 x bfloat> %src1.shuf to <2 x float> + %src2.ext = fpext <2 x bfloat> %src2.shuf to <2 x float> + %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext) + ret <2 x float> %result +} + +define float @v_mad_mix_f32_negbf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_negbf16lo_bf16lo_bf16lo: +; GFX1250: ; %bb.0: +; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %src0.ext.neg = fneg float %src0.ext + %result = tail call float @llvm.fmuladd.f32(float %src0.ext.neg, float %src1.ext, float %src2.ext) + ret float %result +} + +define float @v_mad_mix_f32_absbf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_absbf16lo_bf16lo_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %result = tail call float @llvm.fmuladd.f32(float %src0.ext.abs, float %src1.ext, float %src2.ext) + ret float %result +} + +define float @v_mad_mix_f32_negabsbf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_negabsbf16lo_bf16lo_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -|v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %src0.ext.neg.abs = fneg float %src0.ext.abs + %result = tail call float @llvm.fmuladd.f32(float %src0.ext.neg.abs, float %src1.ext, float %src2.ext) + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_bf16lo_f32(bfloat %src0, bfloat %src1, float %src2) #0 { +; GFX1250-LABEL: 
v_mad_mix_f32_bf16lo_bf16lo_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_bf16lo_negf32(bfloat %src0, bfloat %src1, float %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_negf32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, -v2 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2.neg = fneg float %src2 + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.neg) + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_bf16lo_absf32(bfloat %src0, bfloat %src1, float %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_absf32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, |v2| op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2.abs = call float @llvm.fabs.f32(float %src2) + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.abs) + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_bf16lo_negabsf32(bfloat %src0, bfloat %src1, float %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_negabsf32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, -|v2| op_sel_hi:[1,1,0] +; GFX1250-NEXT: 
s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2.abs = call float @llvm.fabs.f32(float %src2) + %src2.neg.abs = fneg float %src2.abs + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.neg.abs) + ret float %result +} + + +define float @v_mad_mix_f32_bf16lo_bf16lo_f32imm1(bfloat %src0, bfloat %src1) #0 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32imm1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s0, 1.0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 1.0) + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_bf16lo_f32imminv2pi(bfloat %src0, bfloat %src1) #0 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32imminv2pi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s0, 0.15915494 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float 0x3FC45F3060000000) + ret float %result +} + + +define float @v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imminv2pi(bfloat %src0, bfloat %src1) #0 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imminv2pi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s0, 0x3e230000 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: 
v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2 = fpext bfloat 0xR3e23 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) + ret float %result +} + + +define float @v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imm63(bfloat %src0, bfloat %src1) #0 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_cvtbf16imm63: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s0, 0x367c0000 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, s0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2 = fpext bfloat 0xR367c to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) + ret float %result +} + +define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x bfloat> %src0, <2 x bfloat> %src1) #0 { +; GFX1250-LABEL: v_mad_mix_v2f32_f32imm1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], 1.0 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> + %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> + %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> ) + ret <2 x float> %result +} + +define <2 x float> @v_mad_mix_v2f32_cvtbf16imminv2pi(<2 x bfloat> %src0, <2 x bfloat> %src1) #0 { +; GFX1250-LABEL: 
v_mad_mix_v2f32_cvtbf16imminv2pi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX1250-NEXT: s_mov_b32 s0, 0x3e230000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], s[0:1] op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> + %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> + %src2 = fpext <2 x bfloat> to <2 x float> + %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2) + ret <2 x float> %result +} + +define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x bfloat> %src0, <2 x bfloat> %src1) #0 { +; GFX1250-LABEL: v_mad_mix_v2f32_f32imminv2pi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v4, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[2:3], v[4:5], 0.15915494 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> + %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> + %src2 = fpext <2 x bfloat> to <2 x float> + %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> ) + ret <2 x float> %result +} + +define float @v_mad_mix_clamp_f32_bf16hi_bf16hi_bf16hi_elt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_clamp_f32_bf16hi_bf16hi_bf16hi_elt: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.hi = extractelement <2 x bfloat> %src0, i32 1 + %src1.hi = extractelement <2 x bfloat> %src1, i32 1 + %src2.hi = extractelement <2 x bfloat> %src2, i32 1 + %src0.ext = fpext bfloat %src0.hi to float + %src1.ext = fpext bfloat %src1.hi to float + %src2.ext = fpext bfloat %src2.hi to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) + %max = call float @llvm.maxnum.f32(float %result, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + ret float %clamp +} + +define float @no_mix_simple(float %src0, float %src1, float %src2) #0 { +; GFX1250-LABEL: no_mix_simple: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2) + ret float %result +} + +define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 { +; GFX1250-LABEL: no_mix_simple_fabs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_f32 v0, |v0|, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.fabs = call float @llvm.fabs.f32(float %src0) + %result = call float @llvm.fmuladd.f32(float %src0.fabs, float %src1, float %src2) + ret float %result +} + + +define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals(bfloat %src0, bfloat %src1, bfloat %src2) #1 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext 
bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_bf16lo_f32_denormals(bfloat %src0, bfloat %src1, float %src2) #1 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32_denormals: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals_fmulfadd(bfloat %src0, bfloat %src1, bfloat %src2) #1 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_denormals_fmulfadd: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_mul_f32 v0, v0, v1 +; GFX1250-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %mul = fmul float %src0.ext, %src1.ext + %result = fadd float %mul, %src2.ext + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_bf16lo_f32_denormals_fmulfadd(bfloat %src0, bfloat %src1, float %src2) #1 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32_denormals_fmulfadd: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX1250-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %mul = fmul float %src0.ext, %src1.ext + %result = fadd float %mul, %src2 + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_flush_fmulfadd(bfloat %src0, bfloat %src1, bfloat %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_f32_flush_fmulfadd: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %mul = fmul contract float %src0.ext, %src1.ext + %result = fadd contract float %mul, %src2.ext + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_bf16lo_f32_flush_fmulfadd(bfloat %src0, bfloat %src1, float %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_f32_flush_fmulfadd: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %mul = fmul contract float %src0.ext, %src1.ext + %result = fadd contract float %mul, %src2 + ret float %result +} + +define float @v_mad_mix_f32_negprecvtbf16lo_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_negprecvtbf16lo_bf16lo_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 
%src0.arg to <2 x bfloat> + %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 0 + %src0.neg = fneg bfloat %src0 + %src0.ext = fpext bfloat %src0.neg to float + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) + ret float %result +} + + +define float @v_mad_mix_f32_precvtnegbf16hi_abs_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_precvtnegbf16hi_abs_bf16lo_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1 + %src0.neg = fneg bfloat %src0 + %src0.ext = fpext bfloat %src0.neg to float + %src0.ext.abs = call float @llvm.fabs.f32(float %src0.ext) + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext.abs, float %src1.ext, float %src2.ext) + ret float %result +} + +define float @v_mad_mix_f32_precvtabsbf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_precvtabsbf16hi_bf16lo_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %src0 = extractelement <2 x bfloat> %src0.arg.bc, i32 1 + %src0.abs = call bfloat @llvm.fabs.bf16(bfloat %src0) + %src0.ext = fpext bfloat %src0.abs to 
float + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) + ret float %result +} + +define float @v_mad_mix_f32_preextractfneg_bf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_preextractfneg_bf16hi_bf16lo_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %fneg = fneg <2 x bfloat> %src0.arg.bc + %src0 = extractelement <2 x bfloat> %fneg, i32 1 + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) + ret float %result +} + +define float @v_mad_mix_f32_preextractfabs_bf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_preextractfabs_bf16hi_bf16lo_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, |v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %src0.arg.bc) + %src0 = extractelement <2 x bfloat> %fabs, i32 1 + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) + ret float %result +} + +define float @v_mad_mix_f32_preextractfabsfneg_bf16hi_bf16lo_bf16lo(i32 %src0.arg, bfloat %src1, bfloat %src2) #0 { +; GFX1250-LABEL: 
v_mad_mix_f32_preextractfabsfneg_bf16hi_bf16lo_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, -|v0|, v1, v2 op_sel:[1,0,0] op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.arg.bc = bitcast i32 %src0.arg to <2 x bfloat> + %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %src0.arg.bc) + %fneg.fabs = fneg <2 x bfloat> %fabs + %src0 = extractelement <2 x bfloat> %fneg.fabs, i32 1 + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_bf16lo_bf16lo_all_cast_from_half(half %src0, half %src1, half %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_bf16lo_bf16lo_all_cast_from_half: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v3, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_fmac_f32_e32 v0, v3, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.bf16 = bitcast half %src0 to bfloat + %src1.bf16 = bitcast half %src1 to bfloat + %src2.bf16 = bitcast half %src2 to bfloat + %src0.ext = fpext bfloat %src0.bf16 to float + %src1.ext = fpext bfloat %src1.bf16 to float + %src2.ext = fpext bfloat %src2.bf16 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) + ret float %result +} + +define float @v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo(half %src0, bfloat %src1, bfloat %src2) #0 { +; GFX1250-LABEL: v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, 
v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[0,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.bf16 = bitcast half %src0 to bfloat + %src0.ext = fpext bfloat %src0.bf16 to float + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) + ret float %result +} + +define amdgpu_kernel void @test_fma_mix_f32_bf16_src2_bf16lo(float %x, i32 %y, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_fma_mix_f32_bf16_src2_bf16lo: +; GFX1250: ; %bb.0: ; %entry +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, s0, 0, s1 op_sel_hi:[0,0,1] +; GFX1250-NEXT: s_mov_b32 s0, 0 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0 +; GFX1250-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX1250-NEXT: s_endpgm +entry: + %v0 = shl i32 %y, 16 + %v1 = bitcast i32 %v0 to float + %mul7 = fmul contract float %x, 0.000000e+00 + %add2 = fadd contract float %mul7, %v1 + %v2 = fcmp uno float %add2, 0.000000e+00 + %v3 = select i1 %v2, i64 1, i64 0 + store i64 %v3, ptr addrspace(1) %out, align 8 + ret void +} + +declare bfloat @llvm.fabs.bf16(bfloat) #2 +declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #2 +declare float @llvm.fabs.f32(float) #2 +declare float @llvm.minnum.f32(float, float) #2 +declare float @llvm.maxnum.f32(float, float) #2 +declare float @llvm.fmuladd.f32(float, float, float) #2 +declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #2 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #1 = { nounwind "denormal-fp-math-f32"="ieee,ieee" } +attributes #2 
= { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
new file mode 100644
index 0000000000000..5b2de59f7d271
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll
@@ -0,0 +1,205 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s
+
+; fptrunc of the f32 fmuladd is inserted into element 1 of an undef vector;
+; expected to select v_fma_mixhi_bf16 writing the result register directly.
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  %vec.result = insertelement <2 x bfloat> undef, bfloat %cvt.result, i32 1
+  ret <2 x bfloat> %vec.result
+}
+
+; Low element is the constant 1.0 (0x3f80 as bf16); the constant is materialized
+; first and v_fma_mixhi_bf16 then fills in the high half of that register.
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_constlo(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_constlo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_mov_b32_e32 v3, 0x3f80
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_fma_mixhi_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  %vec.result = insertelement <2 x bfloat> <bfloat 1.0, bfloat undef>, bfloat %cvt.result, i32 1
+  ret <2 x bfloat> %vec.result
+}
+
+; Low element comes from the %lo argument (v3); v_fma_mixhi_bf16 writes the high
+; half of v3 and the packed pair is then copied to the return register.
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_reglo(bfloat %src0, bfloat %src1, bfloat %src2, bfloat %lo) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_reglo:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixhi_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v0, v3
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  %vec = insertelement <2 x bfloat> undef, bfloat %lo, i32 0
+  %vec.result = insertelement <2 x bfloat> %vec, bfloat %cvt.result, i32 1
+  ret <2 x bfloat> %vec.result
+}
+
+; High-half placement expressed with integer ops (zext i16 + shl 16); expected
+; v_fma_mixlo_bf16 followed by a shift rather than v_fma_mixhi_bf16.
+define i32 @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  %bc = bitcast bfloat %cvt.result to i16
+  %ext = zext i16 %bc to i32
+  %shr = shl i32 %ext, 16
+  ret i32 %shr
+}
+
+; As the intpack test but with sext; the shl by 16 discards the extension bits,
+; so the expected code is identical.
+define i32 @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack_sext(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_intpack_sext:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  %bc = bitcast bfloat %cvt.result to i16
+  %ext = sext i16 %bc to i32
+  %shr = shl i32 %ext, 16
+  ret i32 %shr
+}
+
+; Clamp to [0.0, 1.0] (maxnum/minnum) on the f32 value before fptrunc; expected to
+; fold into the clamp modifier of v_fma_mix_f32_bf16 with a separate conversion.
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_precvt(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_precvt:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %max = call float @llvm.maxnum.f32(float %result, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  %cvt.result = fptrunc float %clamp to bfloat
+  %vec.result = insertelement <2 x bfloat> undef, bfloat %cvt.result, i32 1
+  ret <2 x bfloat> %vec.result
+}
+
+; Clamp applied to the bf16 value after fptrunc; it is not folded into the mix
+; instruction - the max/min are expanded through f32 with re-conversions.
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    v_max_num_f32_e32 v0, 0, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_min_num_f32_e32 v0, 1.0, v0
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  %max = call bfloat @llvm.maxnum.bf16(bfloat %cvt.result, bfloat 0.0)
+  %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+  %vec.result = insertelement <2 x bfloat> undef, bfloat %clamp, i32 1
+  ret <2 x bfloat> %vec.result
+}
+
+; Same as the clamp_postcvt test, but the truncated value has a second use (a
+; volatile store), so it is kept in a separate register (v1).
+define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt_multi_use(bfloat %src0, bfloat %src1, bfloat %src2) #0 {
+; GFX1250-LABEL: v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt_multi_use:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_fma_mixlo_bf16 v1, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX1250-NEXT:    v_max_num_f32_e32 v0, 0, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_min_num_f32_e32 v0, 1.0, v0
+; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    global_store_b16 v[0:1], v1, off scope:SCOPE_SYS
+; GFX1250-NEXT:    s_wait_storecnt 0x0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
+  %src0.ext = fpext bfloat %src0 to float
+  %src1.ext = fpext bfloat %src1 to float
+  %src2.ext = fpext bfloat %src2 to float
+  %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+  %cvt.result = fptrunc float %result to bfloat
+  store volatile bfloat %cvt.result, ptr addrspace(1) undef
+  %max = call bfloat @llvm.maxnum.bf16(bfloat %cvt.result, bfloat 0.0)
+  %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0)
+  %vec.result = insertelement <2 x bfloat> undef, bfloat %clamp, i32 1
+  ret <2 x bfloat> %vec.result
+}
+
+declare bfloat @llvm.minnum.bf16(bfloat, bfloat) #1
+declare bfloat @llvm.maxnum.bf16(bfloat, bfloat) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.fmuladd.f32(float, float, float) #1
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll
new file mode 100644
index 0000000000000..557080a61041c
--- /dev/null
+++ 
b/llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll @@ -0,0 +1,540 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s + +define bfloat @mixlo_simple(float %src0, float %src1, float %src2) #0 { +; GFX1250-LABEL: mixlo_simple: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2) + %cvt.result = fptrunc float %result to bfloat + ret bfloat %cvt.result +} + +define bfloat @mixlo_simpl_no_flush(float %src0, float %src1, float %src2) { +; GFX1250-LABEL: mixlo_simpl_no_flush: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2) + %cvt.result = fptrunc float %result to bfloat + ret bfloat %cvt.result +} + +define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo(bfloat %src0, bfloat %src1, bfloat %src2) #0 { +; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) + %cvt.result = fptrunc float %result to bfloat + ret bfloat %cvt.result +} + +define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo_no_flush(bfloat %src0, bfloat %src1, bfloat %src2) { +; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_bf16lo_no_flush: +; 
GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %src2.ext = fpext bfloat %src2 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext) + %cvt.result = fptrunc float %result to bfloat + ret bfloat %cvt.result +} + +define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32(bfloat %src0, bfloat %src1, float %src2) #0 { +; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) + %cvt.result = fptrunc float %result to bfloat + ret bfloat %cvt.result +} + +define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, bfloat %src1, float %src2) #0 { +; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; 
GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) + %cvt.result = fptrunc float %result to bfloat + %max = call bfloat @llvm.maxnum.bf16(bfloat %cvt.result, bfloat 0.0) + %clamp = call bfloat @llvm.minnum.bf16(bfloat %max, bfloat 1.0) + ret bfloat %clamp +} + +define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_pre_cvt(bfloat %src0, bfloat %src1, float %src2) #0 { +; GFX1250-LABEL: v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_pre_cvt: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0] clamp +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext bfloat %src0 to float + %src1.ext = fpext bfloat %src1 to float + %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2) + %max = call float @llvm.maxnum.f32(float %result, float 0.0) + %clamp = call float @llvm.minnum.f32(float %max, float 1.0) + %cvt.result = fptrunc float %clamp to bfloat + ret bfloat %cvt.result +} + + +define <2 x bfloat> @v_mad_mix_v2f32(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_v2f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; 
GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> + %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> + %src2.ext = fpext <2 x bfloat> %src2 to <2 x float> + %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext) + %cvt.result = fptrunc <2 x float> %result to <2 x bfloat> + ret <2 x bfloat> %cvt.result +} + +define <3 x bfloat> @v_mad_mix_v3f32(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_v3f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX1250-NEXT: v_mov_b32_e32 v0, v6 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <3 x bfloat> %src0 to <3 x float> + %src1.ext = fpext <3 x bfloat> %src1 to <3 x float> + %src2.ext = fpext <3 x bfloat> %src2 to <3 x float> + %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext) + %cvt.result = fptrunc <3 x float> %result to <3 x bfloat> + ret <3 x bfloat> %cvt.result +} + +define <4 x bfloat> @v_mad_mix_v4f32(<4 x bfloat> %src0, <4 x bfloat> %src1, <4 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_v4f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX1250-NEXT: v_lshlrev_b32_e32 v6, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v3 +; GFX1250-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; 
GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v5 +; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v4 +; GFX1250-NEXT: v_dual_lshlrev_b32 v12, 16, v4 :: v_dual_lshlrev_b32 v10, 16, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[12:13] +; GFX1250-NEXT: v_pk_fma_f32 v[2:3], v[6:7], v[8:9], v[10:11] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <4 x bfloat> %src0 to <4 x float> + %src1.ext = fpext <4 x bfloat> %src1 to <4 x float> + %src2.ext = fpext <4 x bfloat> %src2 to <4 x float> + %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext) + %cvt.result = fptrunc <4 x float> %result to <4 x bfloat> + ret <4 x bfloat> %cvt.result +} + + +define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, 0 +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX1250-NEXT: 
s_set_pc_i64 s[30:31] + %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> + %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> + %src2.ext = fpext <2 x bfloat> %src2 to <2 x float> + %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext) + %cvt.result = fptrunc <2 x float> %result to <2 x bfloat> + %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %cvt.result, <2 x bfloat> zeroinitializer) + %clamp = call <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat> %max, <2 x bfloat> <bfloat 1.0, bfloat 1.0>) + ret <2 x bfloat> %clamp +} + + +define <3 x bfloat> @v_mad_mix_v3f32_clamp_postcvt(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_v3f32_clamp_postcvt: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v1, v3, v5 op_sel_hi:[1,1,1] +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v6, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_pk_max_num_bf16 v2, v0, 0 +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v1, 1.0 op_sel_hi:[1,0] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_pk_min_num_bf16 v1, v2, 1.0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <3 x bfloat> %src0 to <3 x float> + %src1.ext = fpext <3 x bfloat> %src1 to <3 x float> + %src2.ext = fpext <3 x bfloat> %src2 to <3 x float> + %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext) + %cvt.result = fptrunc <3 x float> %result to <3 x bfloat> + %max = call <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat> %cvt.result, <3 x bfloat> zeroinitializer) + %clamp = call <3 x
bfloat> @llvm.minnum.v3bf16(<3 x bfloat> %max, <3 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0>) + ret <3 x bfloat> %clamp +} + +define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bfloat> %src1, <4 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_v4f32_clamp_postcvt: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v6, 16, v0 :: v_dual_lshlrev_b32 v8, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v10, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX1250-NEXT: v_dual_lshlrev_b32 v2, 16, v4 :: v_dual_lshlrev_b32 v12, 16, v5 +; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[6:7], v[0:1], v[2:3] +; GFX1250-NEXT: v_pk_fma_f32 v[2:3], v[8:9], v[10:11], v[12:13] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, 0 +; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0] +; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, 1.0 op_sel_hi:[1,0] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <4 x bfloat> %src0 to <4 x float> + %src1.ext = fpext <4 x bfloat> %src1 to <4 x float> + %src2.ext = fpext <4 x bfloat> %src2 to <4 x float> + %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float>
%src2.ext) + %cvt.result = fptrunc <4 x float> %result to <4 x bfloat> + %max = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %cvt.result, <4 x bfloat> zeroinitializer) + %clamp = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %max, <4 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>) + ret <4 x bfloat> %clamp +} + +define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt_lo: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1250-NEXT: v_max_num_f32_e32 v1, 0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_num_f32_e32 v1, 1.0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> + %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> + %src2.ext = fpext <2 x bfloat> %src2 to <2 x float> + %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext) +
%cvt.result = fptrunc <2 x float> %result to <2 x bfloat> + %cvt.lo = extractelement <2 x bfloat> %cvt.result, i32 0 + %max.lo = call bfloat @llvm.maxnum.bf16(bfloat %cvt.lo, bfloat 0.0) + %clamp.lo = call bfloat @llvm.minnum.bf16(bfloat %max.lo, bfloat 1.0) + %insert = insertelement <2 x bfloat> %cvt.result, bfloat %clamp.lo, i32 0 + ret <2 x bfloat> %insert +} + +define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_v2f32_clamp_postcvt_hi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX1250-NEXT: v_max_num_f32_e32 v1, 0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_min_num_f32_e32 v1, 1.0, v1 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> + %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> + %src2.ext = fpext <2 x bfloat> %src2 to <2 x float> + %result = tail call <2 x float> 
@llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext) + %cvt.result = fptrunc <2 x float> %result to <2 x bfloat> + %cvt.hi = extractelement <2 x bfloat> %cvt.result, i32 1 + %max.hi = call bfloat @llvm.maxnum.bf16(bfloat %cvt.hi, bfloat 0.0) + %clamp.hi = call bfloat @llvm.minnum.bf16(bfloat %max.hi, bfloat 1.0) + %insert = insertelement <2 x bfloat> %cvt.result, bfloat %clamp.hi, i32 1 + ret <2 x bfloat> %insert +} + + +define <2 x bfloat> @v_mad_mix_v2f32_clamp_precvt(<2 x bfloat> %src0, <2 x bfloat> %src1, <2 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_v2f32_clamp_precvt: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v4, 16, v0 :: v_dual_lshlrev_b32 v6, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1] +; GFX1250-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <2 x bfloat> %src0 to <2 x float> + %src1.ext = fpext <2 x bfloat> %src1 to <2 x float> + %src2.ext = fpext <2 x bfloat> %src2 to <2 x float> + %result = tail call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %src0.ext, <2 x float> %src1.ext, <2 x float> %src2.ext) + %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %result, <2 x float> zeroinitializer) + %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>) + %cvt.result = fptrunc <2 x float> %clamp to <2 x bfloat> + ret <2 x bfloat> %cvt.result +} + + +define <3 x bfloat>
@v_mad_mix_v3f32_clamp_precvt(<3 x bfloat> %src0, <3 x bfloat> %src1, <3 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_v3f32_clamp_precvt: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mix_f32_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp +; GFX1250-NEXT: v_fma_mix_f32_bf16 v0, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp +; GFX1250-NEXT: v_fma_mix_f32_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0 +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <3 x bfloat> %src0 to <3 x float> + %src1.ext = fpext <3 x bfloat> %src1 to <3 x float> + %src2.ext = fpext <3 x bfloat> %src2 to <3 x float> + %result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext) + %max = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %result, <3 x float> zeroinitializer) + %clamp = call <3 x float> @llvm.minnum.v3f32(<3 x float> %max, <3 x float> <float 1.0, float 1.0, float 1.0>) + %cvt.result = fptrunc <3 x float> %clamp to <3 x bfloat> + ret <3 x bfloat> %cvt.result +} + +define <4 x bfloat> @v_mad_mix_v4f32_clamp_precvt(<4 x bfloat> %src0, <4 x bfloat> %src1, <4 x bfloat> %src2) #0 { +; GFX1250-LABEL: v_mad_mix_v4f32_clamp_precvt: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX1250-NEXT: v_dual_lshlrev_b32 v6, 16, v0 :: v_dual_lshlrev_b32 v8, 16, v1 +; GFX1250-NEXT: v_and_b32_e32 v9, 0xffff0000, v1 +; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2 +; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v10, 16, v3 +; GFX1250-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 +; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v4 +; GFX1250-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 +; GFX1250-NEXT:
v_dual_lshlrev_b32 v12, 16, v5 :: v_dual_lshlrev_b32 v2, 16, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_pk_fma_f32 v[4:5], v[8:9], v[10:11], v[12:13] +; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[6:7], v[0:1], v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_max_num_f32_e64 v2, v5, v5 clamp +; GFX1250-NEXT: v_max_num_f32_e64 v1, v1, v1 clamp +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_max_num_f32_e64 v0, v0, v0 clamp +; GFX1250-NEXT: v_max_num_f32_e64 v3, v4, v4 clamp +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v3, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %src0.ext = fpext <4 x bfloat> %src0 to <4 x float> + %src1.ext = fpext <4 x bfloat> %src1 to <4 x float> + %src2.ext = fpext <4 x bfloat> %src2 to <4 x float> + %result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext) + %max = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %result, <4 x float> zeroinitializer) + %clamp = call <4 x float> @llvm.minnum.v4f32(<4 x float> %max, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>) + %cvt.result = fptrunc <4 x float> %clamp to <4 x bfloat> + ret <4 x bfloat> %cvt.result +} + +define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 { +; GFX1250-LABEL: mixlo_zext: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] + %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2) + %cvt.result = fptrunc float %result to bfloat + %cvt.result.i16 = bitcast bfloat %cvt.result to i16 + %cvt.result.i32 = zext i16
%cvt.result.i16 to i32 + ret i32 %cvt.result.i32 +} + +define bfloat @mixlo_fptrunc(float %a, float %b) #0 { +; GFX1250-LABEL: mixlo_fptrunc: +; GFX1250: ; %bb.0: ; %.entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, 0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +.entry: + %mul = fmul float %a, %b + %trunc = fptrunc float %mul to bfloat + ret bfloat %trunc +} + +define bfloat @mixlo_fptrunc_no_flush(float %a, float %b) { +; GFX1250-LABEL: mixlo_fptrunc_no_flush: +; GFX1250: ; %bb.0: ; %.entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, 0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +.entry: + %mul = fmul float %a, %b + %trunc = fptrunc float %mul to bfloat + ret bfloat %trunc +} + +define bfloat @mixlo_fptrunc_abs_src_mod(float %a, float %b) #0 { +; GFX1250-LABEL: mixlo_fptrunc_abs_src_mod: +; GFX1250: ; %bb.0: ; %.entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, |v0|, v1, 0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +.entry: + %a.fabs = call float @llvm.fabs.f32(float %a) + %mul = fmul float %a.fabs, %b + %trunc = fptrunc float %mul to bfloat + ret bfloat %trunc +} + +define bfloat @mixlo_fptrunc_neg_src_mod(float %a, float %b) #0 { +; GFX1250-LABEL: mixlo_fptrunc_neg_src_mod: +; GFX1250: ; %bb.0: ; %.entry +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_fma_mixlo_bf16 v0, -v0, v1, 0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +.entry: + %a.fneg = fneg float %a + %mul = fmul float %a.fneg, %b + %trunc = fptrunc float %mul to bfloat + ret bfloat %trunc +} + +declare float @llvm.fabs.f32(float) #1 + +declare bfloat @llvm.minnum.bf16(bfloat, bfloat) #1 +declare <2 x bfloat> @llvm.minnum.v2bf16(<2 x bfloat>, <2 x bfloat>) #1 +declare <3 x bfloat> @llvm.minnum.v3bf16(<3 x bfloat>, <3 x bfloat>) #1 +declare <4 x bfloat> 
@llvm.minnum.v4bf16(<4 x bfloat>, <4 x bfloat>) #1 + +declare bfloat @llvm.maxnum.bf16(bfloat, bfloat) #1 +declare <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat>, <2 x bfloat>) #1 +declare <3 x bfloat> @llvm.maxnum.v3bf16(<3 x bfloat>, <3 x bfloat>) #1 +declare <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat>, <4 x bfloat>) #1 + +declare float @llvm.minnum.f32(float, float) #1 +declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1 +declare <3 x float> @llvm.minnum.v3f32(<3 x float>, <3 x float>) #1 +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #1 + +declare float @llvm.maxnum.f32(float, float) #1 +declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1 +declare <3 x float> @llvm.maxnum.v3f32(<3 x float>, <3 x float>) #1 +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #1 + +declare float @llvm.fmuladd.f32(float, float, float) #1 +declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1 +declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) #1 +declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1 + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #1 = { nounwind readnone speculatable } diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s index 88346941bb2cd..a17fa674e6f6b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s @@ -1313,3 +1313,171 @@ v_pk_max3_num_f16 v1, v4, v9, v16 v_pk_max3_num_f16 v1, v2, v5, 1.0 // GFX1250: v_pk_max3_num_f16 v1, v2, v5, 1.0 ; encoding: [0x01,0x40,0x39,0xcc,0x02,0x0b,0xca,0x1b] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mix_f32_bf16 v5, v1, v2, s3 +// GFX1250: v_fma_mix_f32_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3d,0xcc,0x01,0x05,0x0e,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_fma_mix_f32_bf16 v5, v255, v255, s105 +// GFX1250: v_fma_mix_f32_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3d,0xcc,0xff,0xff,0xa7,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mix_f32_bf16 v5, s1, s2, v3 +// GFX1250: v_fma_mix_f32_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3d,0xcc,0x01,0x04,0x0c,0x04] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mix_f32_bf16 v5, s105, s105, m0 +// GFX1250: v_fma_mix_f32_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3d,0xcc,0x69,0xd2,0xf4,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mix_f32_bf16 v5, vcc_lo, ttmp15, ttmp15 +// GFX1250: v_fma_mix_f32_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3d,0xcc,0x6a,0xf6,0xec,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mix_f32_bf16 v5, vcc_hi, src_scc, v255 +// GFX1250: v_fma_mix_f32_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3d,0xcc,0x6b,0xfa,0xfd,0x07] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mix_f32_bf16 v5, |ttmp15|, 0.5, -vcc_hi +// GFX1250: v_fma_mix_f32_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3d,0xcc,0x7b,0xe0,0xad,0x81] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mix_f32_bf16 v5, -m0, -1, |vcc_lo| +// GFX1250: v_fma_mix_f32_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3d,0xcc,0x7d,0x82,0xa9,0x21] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mix_f32_bf16 v5, -|exec_lo|, null, -|src_scc| +// GFX1250: v_fma_mix_f32_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3d,0xcc,0x7e,0xf8,0xf4,0xa3] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mix_f32_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] +// GFX1250: v_fma_mix_f32_bf16 v5, -|exec_hi|, 
-|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3d,0xcc,0x7f,0xfc,0xf8,0xf9] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mix_f32_bf16 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1] +// GFX1250: v_fma_mix_f32_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3d,0xcc,0x7c,0xfe,0xc0,0x03] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mix_f32_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] +// GFX1250: v_fma_mix_f32_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3d,0xcc,0xc1,0xfa,0x04,0x53] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mix_f32_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] +// GFX1250: v_fma_mix_f32_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3d,0xcc,0xf0,0xd4,0xfc,0xc9] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mix_f32_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp +// GFX1250: v_fma_mix_f32_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3d,0xcc,0xfd,0xd6,0xf0,0x61] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixlo_bf16 v5, v1, v2, s3 +// GFX1250: v_fma_mixlo_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3e,0xcc,0x01,0x05,0x0e,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixlo_bf16 v5, v255, v255, s105 +// GFX1250: v_fma_mixlo_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3e,0xcc,0xff,0xff,0xa7,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixlo_bf16 v5, s1, s2, v3 +// GFX1250: v_fma_mixlo_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3e,0xcc,0x01,0x04,0x0c,0x04] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not 
supported on this GPU + +v_fma_mixlo_bf16 v5, s105, s105, m0 +// GFX1250: v_fma_mixlo_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3e,0xcc,0x69,0xd2,0xf4,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixlo_bf16 v5, vcc_lo, ttmp15, ttmp15 +// GFX1250: v_fma_mixlo_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3e,0xcc,0x6a,0xf6,0xec,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixlo_bf16 v5, vcc_hi, src_scc, v255 +// GFX1250: v_fma_mixlo_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3e,0xcc,0x6b,0xfa,0xfd,0x07] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixlo_bf16 v5, |ttmp15|, 0.5, -vcc_hi +// GFX1250: v_fma_mixlo_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3e,0xcc,0x7b,0xe0,0xad,0x81] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixlo_bf16 v5, -m0, -1, |vcc_lo| +// GFX1250: v_fma_mixlo_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3e,0xcc,0x7d,0x82,0xa9,0x21] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixlo_bf16 v5, -|exec_lo|, null, -|src_scc| +// GFX1250: v_fma_mixlo_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3e,0xcc,0x7e,0xf8,0xf4,0xa3] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixlo_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] +// GFX1250: v_fma_mixlo_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3e,0xcc,0x7f,0xfc,0xf8,0xf9] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixlo_bf16 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1] +// GFX1250: v_fma_mixlo_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3e,0xcc,0x7c,0xfe,0xc0,0x03] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported 
on this GPU + +v_fma_mixlo_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] +// GFX1250: v_fma_mixlo_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3e,0xcc,0xc1,0xfa,0x04,0x53] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixlo_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] +// GFX1250: v_fma_mixlo_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3e,0xcc,0xf0,0xd4,0xfc,0xc9] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixlo_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp +// GFX1250: v_fma_mixlo_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3e,0xcc,0xfd,0xd6,0xf0,0x61] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixhi_bf16 v5, v1, v2, s3 +// GFX1250: v_fma_mixhi_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3f,0xcc,0x01,0x05,0x0e,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixhi_bf16 v5, v255, v255, s105 +// GFX1250: v_fma_mixhi_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3f,0xcc,0xff,0xff,0xa7,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixhi_bf16 v5, s1, s2, v3 +// GFX1250: v_fma_mixhi_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3f,0xcc,0x01,0x04,0x0c,0x04] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixhi_bf16 v5, s105, s105, m0 +// GFX1250: v_fma_mixhi_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3f,0xcc,0x69,0xd2,0xf4,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixhi_bf16 v5, vcc_lo, ttmp15, ttmp15 +// GFX1250: v_fma_mixhi_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3f,0xcc,0x6a,0xf6,0xec,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this 
GPU + +v_fma_mixhi_bf16 v5, vcc_hi, src_scc, v255 +// GFX1250: v_fma_mixhi_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3f,0xcc,0x6b,0xfa,0xfd,0x07] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixhi_bf16 v5, |ttmp15|, 0.5, -vcc_hi +// GFX1250: v_fma_mixhi_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3f,0xcc,0x7b,0xe0,0xad,0x81] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixhi_bf16 v5, -m0, -1, |vcc_lo| +// GFX1250: v_fma_mixhi_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3f,0xcc,0x7d,0x82,0xa9,0x21] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixhi_bf16 v5, -|exec_lo|, null, -|src_scc| +// GFX1250: v_fma_mixhi_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3f,0xcc,0x7e,0xf8,0xf4,0xa3] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixhi_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] +// GFX1250: v_fma_mixhi_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3f,0xcc,0x7f,0xfc,0xf8,0xf9] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixhi_bf16 v5, null, exec_hi, 0.5 op_sel:[0,0,0] op_sel_hi:[0,0,1] +// GFX1250: v_fma_mixhi_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3f,0xcc,0x7c,0xfe,0xc0,0x03] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixhi_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] +// GFX1250: v_fma_mixhi_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3f,0xcc,0xc1,0xfa,0x04,0x53] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixhi_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] +// GFX1250: v_fma_mixhi_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] 
op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3f,0xcc,0xf0,0xd4,0xfc,0xc9] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_fma_mixhi_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] op_sel_hi:[0,0,0] clamp +// GFX1250: v_fma_mixhi_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3f,0xcc,0xfd,0xd6,0xf0,0x61] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt index d3ef89957c255..18246db41749d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt @@ -905,3 +905,129 @@ # GFX1250: v_pk_max3_num_f16 v8, v1, s1, v4 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x39,0xcc,0x01,0x03,0x10,0x04] 0x08,0x00,0x39,0xcc,0x01,0x03,0x10,0x04 + +# GFX1250: v_fma_mix_f32_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3d,0xcc,0xfd,0xd6,0xf0,0x61] +0xff,0xa3,0x3d,0xcc,0xfd,0xd6,0xf0,0x61 + +# GFX1250: v_fma_mix_f32_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3d,0xcc,0xc1,0xfa,0x04,0x53] +0x05,0x0a,0x3d,0xcc,0xc1,0xfa,0x04,0x53 + +# GFX1250: v_fma_mix_f32_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3d,0xcc,0x7d,0x82,0xa9,0x21] +0x05,0x04,0x3d,0xcc,0x7d,0x82,0xa9,0x21 + +# GFX1250: v_fma_mix_f32_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3d,0xcc,0x7f,0xfc,0xf8,0xf9] +0x05,0x7f,0x3d,0xcc,0x7f,0xfc,0xf8,0xf9 + +# GFX1250: v_fma_mix_f32_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3d,0xcc,0x7e,0xf8,0xf4,0xa3] +0x05,0x05,0x3d,0xcc,0x7e,0xf8,0xf4,0xa3 + +# GFX1250: v_fma_mix_f32_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3d,0xcc,0xf0,0xd4,0xfc,0xc9] +0x05,0x16,0x3d,0xcc,0xf0,0xd4,0xfc,0xc9 + +# 
GFX1250: v_fma_mix_f32_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3d,0xcc,0x7c,0xfe,0xc0,0x03] +0x05,0x40,0x3d,0xcc,0x7c,0xfe,0xc0,0x03 + +# GFX1250: v_fma_mix_f32_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3d,0xcc,0x01,0x04,0x0c,0x04] +0x05,0x00,0x3d,0xcc,0x01,0x04,0x0c,0x04 + +# GFX1250: v_fma_mix_f32_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3d,0xcc,0x69,0xd2,0xf4,0x01] +0x05,0x00,0x3d,0xcc,0x69,0xd2,0xf4,0x01 + +# GFX1250: v_fma_mix_f32_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3d,0xcc,0x01,0x05,0x0e,0x00] +0x05,0x00,0x3d,0xcc,0x01,0x05,0x0e,0x00 + +# GFX1250: v_fma_mix_f32_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3d,0xcc,0xff,0xff,0xa7,0x01] +0x05,0x00,0x3d,0xcc,0xff,0xff,0xa7,0x01 + +# GFX1250: v_fma_mix_f32_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3d,0xcc,0x6b,0xfa,0xfd,0x07] +0x05,0x00,0x3d,0xcc,0x6b,0xfa,0xfd,0x07 + +# GFX1250: v_fma_mix_f32_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3d,0xcc,0x6a,0xf6,0xec,0x01] +0x05,0x00,0x3d,0xcc,0x6a,0xf6,0xec,0x01 + +# GFX1250: v_fma_mix_f32_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3d,0xcc,0x7b,0xe0,0xad,0x81] +0x05,0x01,0x3d,0xcc,0x7b,0xe0,0xad,0x81 + +# GFX1250: v_fma_mixlo_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3e,0xcc,0xfd,0xd6,0xf0,0x61] +0xff,0xa3,0x3e,0xcc,0xfd,0xd6,0xf0,0x61 + +# GFX1250: v_fma_mixlo_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3e,0xcc,0xc1,0xfa,0x04,0x53] +0x05,0x0a,0x3e,0xcc,0xc1,0xfa,0x04,0x53 + +# GFX1250: v_fma_mixlo_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3e,0xcc,0x7d,0x82,0xa9,0x21] +0x05,0x04,0x3e,0xcc,0x7d,0x82,0xa9,0x21 + +# GFX1250: v_fma_mixlo_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3e,0xcc,0x7f,0xfc,0xf8,0xf9] +0x05,0x7f,0x3e,0xcc,0x7f,0xfc,0xf8,0xf9 + +# GFX1250: v_fma_mixlo_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: 
[0x05,0x05,0x3e,0xcc,0x7e,0xf8,0xf4,0xa3] +0x05,0x05,0x3e,0xcc,0x7e,0xf8,0xf4,0xa3 + +# GFX1250: v_fma_mixlo_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3e,0xcc,0xf0,0xd4,0xfc,0xc9] +0x05,0x16,0x3e,0xcc,0xf0,0xd4,0xfc,0xc9 + +# GFX1250: v_fma_mixlo_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3e,0xcc,0x7c,0xfe,0xc0,0x03] +0x05,0x40,0x3e,0xcc,0x7c,0xfe,0xc0,0x03 + +# GFX1250: v_fma_mixlo_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3e,0xcc,0x01,0x04,0x0c,0x04] +0x05,0x00,0x3e,0xcc,0x01,0x04,0x0c,0x04 + +# GFX1250: v_fma_mixlo_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3e,0xcc,0x69,0xd2,0xf4,0x01] +0x05,0x00,0x3e,0xcc,0x69,0xd2,0xf4,0x01 + +# GFX1250: v_fma_mixlo_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3e,0xcc,0x01,0x05,0x0e,0x00] +0x05,0x00,0x3e,0xcc,0x01,0x05,0x0e,0x00 + +# GFX1250: v_fma_mixlo_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3e,0xcc,0xff,0xff,0xa7,0x01] +0x05,0x00,0x3e,0xcc,0xff,0xff,0xa7,0x01 + +# GFX1250: v_fma_mixlo_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3e,0xcc,0x6b,0xfa,0xfd,0x07] +0x05,0x00,0x3e,0xcc,0x6b,0xfa,0xfd,0x07 + +# GFX1250: v_fma_mixlo_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3e,0xcc,0x6a,0xf6,0xec,0x01] +0x05,0x00,0x3e,0xcc,0x6a,0xf6,0xec,0x01 + +# GFX1250: v_fma_mixlo_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3e,0xcc,0x7b,0xe0,0xad,0x81] +0x05,0x01,0x3e,0xcc,0x7b,0xe0,0xad,0x81 + +# GFX1250: v_fma_mixhi_bf16 v255, -|src_scc|, -|vcc_hi|, null op_sel:[0,0,1] clamp ; encoding: [0xff,0xa3,0x3f,0xcc,0xfd,0xd6,0xf0,0x61] +0xff,0xa3,0x3f,0xcc,0xfd,0xd6,0xf0,0x61 + +# GFX1250: v_fma_mixhi_bf16 v5, -1, -|m0|, -1 op_sel:[1,0,0] op_sel_hi:[0,1,0] ; encoding: [0x05,0x0a,0x3f,0xcc,0xc1,0xfa,0x04,0x53] +0x05,0x0a,0x3f,0xcc,0xc1,0xfa,0x04,0x53 + +# GFX1250: v_fma_mixhi_bf16 v5, -m0, -1, |vcc_lo| ; encoding: [0x05,0x04,0x3f,0xcc,0x7d,0x82,0xa9,0x21] +0x05,0x04,0x3f,0xcc,0x7d,0x82,0xa9,0x21 + +# GFX1250: 
v_fma_mixhi_bf16 v5, -|exec_hi|, -|exec_lo|, -|exec_lo| op_sel:[1,1,1] op_sel_hi:[1,1,1] ; encoding: [0x05,0x7f,0x3f,0xcc,0x7f,0xfc,0xf8,0xf9] +0x05,0x7f,0x3f,0xcc,0x7f,0xfc,0xf8,0xf9 + +# GFX1250: v_fma_mixhi_bf16 v5, -|exec_lo|, null, -|src_scc| ; encoding: [0x05,0x05,0x3f,0xcc,0x7e,0xf8,0xf4,0xa3] +0x05,0x05,0x3f,0xcc,0x7e,0xf8,0xf4,0xa3 + +# GFX1250: v_fma_mixhi_bf16 v5, 0.5, -|vcc_lo|, -|exec_hi| op_sel:[0,1,0] op_sel_hi:[1,0,0] ; encoding: [0x05,0x16,0x3f,0xcc,0xf0,0xd4,0xfc,0xc9] +0x05,0x16,0x3f,0xcc,0xf0,0xd4,0xfc,0xc9 + +# GFX1250: v_fma_mixhi_bf16 v5, null, exec_hi, 0.5 op_sel_hi:[0,0,1] ; encoding: [0x05,0x40,0x3f,0xcc,0x7c,0xfe,0xc0,0x03] +0x05,0x40,0x3f,0xcc,0x7c,0xfe,0xc0,0x03 + +# GFX1250: v_fma_mixhi_bf16 v5, s1, s2, v3 ; encoding: [0x05,0x00,0x3f,0xcc,0x01,0x04,0x0c,0x04] +0x05,0x00,0x3f,0xcc,0x01,0x04,0x0c,0x04 + +# GFX1250: v_fma_mixhi_bf16 v5, s105, s105, m0 ; encoding: [0x05,0x00,0x3f,0xcc,0x69,0xd2,0xf4,0x01] +0x05,0x00,0x3f,0xcc,0x69,0xd2,0xf4,0x01 + +# GFX1250: v_fma_mixhi_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x3f,0xcc,0x01,0x05,0x0e,0x00] +0x05,0x00,0x3f,0xcc,0x01,0x05,0x0e,0x00 + +# GFX1250: v_fma_mixhi_bf16 v5, v255, v255, s105 ; encoding: [0x05,0x00,0x3f,0xcc,0xff,0xff,0xa7,0x01] +0x05,0x00,0x3f,0xcc,0xff,0xff,0xa7,0x01 + +# GFX1250: v_fma_mixhi_bf16 v5, vcc_hi, src_scc, v255 ; encoding: [0x05,0x00,0x3f,0xcc,0x6b,0xfa,0xfd,0x07] +0x05,0x00,0x3f,0xcc,0x6b,0xfa,0xfd,0x07 + +# GFX1250: v_fma_mixhi_bf16 v5, vcc_lo, ttmp15, ttmp15 ; encoding: [0x05,0x00,0x3f,0xcc,0x6a,0xf6,0xec,0x01] +0x05,0x00,0x3f,0xcc,0x6a,0xf6,0xec,0x01 + +# GFX1250: v_fma_mixhi_bf16 v5, |ttmp15|, 0.5, -vcc_hi ; encoding: [0x05,0x01,0x3f,0xcc,0x7b,0xe0,0xad,0x81] +0x05,0x01,0x3f,0xcc,0x7b,0xe0,0xad,0x81