116 changes: 61 additions & 55 deletions llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,7 @@ def AArch64fcvtr_mt : SDNode<"AArch64ISD::FP_ROUND_MERGE_PASSTHRU", SDT_AArch64
def AArch64fcvte_mt : SDNode<"AArch64ISD::FP_EXTEND_MERGE_PASSTHRU", SDT_AArch64FCVT>;
def AArch64ucvtf_mt : SDNode<"AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>;
def AArch64scvtf_mt : SDNode<"AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>;
def AArch64fcvtx_mt : SDNode<"AArch64ISD::FCVTX_MERGE_PASSTHRU", SDT_AArch64FCVT>;
def AArch64fcvtzu_mt : SDNode<"AArch64ISD::FCVTZU_MERGE_PASSTHRU", SDT_AArch64FCVT>;
def AArch64fcvtzs_mt : SDNode<"AArch64ISD::FCVTZS_MERGE_PASSTHRU", SDT_AArch64FCVT>;

Expand Down Expand Up @@ -635,6 +636,13 @@ let Predicates = [HasSVEorSME] in {
defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", AArch64fabs_mt>;
defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", AArch64fneg_mt>;

foreach VT = [nxv2bf16, nxv4bf16, nxv8bf16] in {
def : Pat<(VT (fabs VT:$op)),
(AND_ZI $op, (i64 (logical_imm64_XFORM(i64 0x7fff7fff7fff7fff))))>;
def : Pat<(VT (fneg VT:$op)),
(EOR_ZI $op, (i64 (logical_imm64_XFORM(i64 0x8000800080008000))))>;
}

// zext(cmpeq(x, splat(0))) -> cnot(x)
def : Pat<(nxv16i8 (zext (nxv16i1 (AArch64setcc_z (nxv16i1 (SVEAllActive):$Pg), nxv16i8:$Op2, (SVEDup0), SETEQ)))),
(CNOT_ZPmZ_B $Op2, $Pg, $Op2)>;
Expand Down Expand Up @@ -2320,7 +2328,12 @@ let Predicates = [HasSVEorSME] in {
def : Pat<(nxv2f16 (AArch64fcvtr_mt (nxv2i1 (SVEAllActive:$Pg)), nxv2f32:$Zs, (i64 timm0_1), nxv2f16:$Zd)),
(FCVT_ZPmZ_StoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;

// Signed integer -> Floating-point
def : Pat<(nxv4f32 (fpextend nxv4bf16:$op)),
(LSL_ZZI_S $op, (i32 16))>;
def : Pat<(nxv2f32 (fpextend nxv2bf16:$op)),
(LSL_ZZI_S $op, (i32 16))>;

// Signed integer -> Floating-point
def : Pat<(nxv2f16 (AArch64scvtf_mt (nxv2i1 (SVEAllActive):$Pg),
(sext_inreg nxv2i64:$Zs, nxv2i16), nxv2f16:$Zd)),
(SCVTF_ZPmZ_HtoH_UNDEF ZPR:$Zd, PPR:$Pg, ZPR:$Zs)>;
Expand Down Expand Up @@ -2392,7 +2405,7 @@ let Predicates = [HasBF16, HasSVEorSME] in {
defm BFMLALT_ZZZ : sve2_fp_mla_long<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt>;
defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>;
defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>;
defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>;
defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32, AArch64fcvtr_mt>;
defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>;
} // End HasBF16, HasSVEorSME

Expand Down Expand Up @@ -3746,7 +3759,7 @@ let Predicates = [HasSVE2orSME, UseExperimentalZeroingPseudos] in {
let Predicates = [HasSVE2orSME] in {
// SVE2 floating-point convert precision
defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding_top<"fcvtxnt", "int_aarch64_sve_fcvtxnt">;
defm FCVTX_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtx", "int_aarch64_sve_fcvtx">;
defm FCVTX_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtx", "int_aarch64_sve_fcvtx", AArch64fcvtx_mt>;
defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt", "int_aarch64_sve_fcvtnt">;
defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt", "int_aarch64_sve_fcvtlt">;

Expand Down Expand Up @@ -3892,7 +3905,7 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>;
//===----------------------------------------------------------------------===//

let Predicates = [HasSVE2p1_or_HasSME2] in {
defm FCLAMP_ZZZ : sve2p1_fclamp<"fclamp", AArch64fclamp>;
defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>;

defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>;
defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>;
Expand Down Expand Up @@ -4075,57 +4088,50 @@ def : InstAlias<"pfalse\t$Pd", (PFALSE PPRorPNR8:$Pd), 0>;
// Non-widening BFloat16 to BFloat16 instructions
//===----------------------------------------------------------------------===//

let Predicates = [HasSVE2orSME2, HasSVEB16B16, UseExperimentalZeroingPseudos] in {
defm BFADD_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fadd>;
defm BFSUB_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fsub>;
defm BFMUL_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmul>;
defm BFMAXNM_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmaxnm>;
defm BFMINNM_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fminnm>;
defm BFMIN_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmin>;
defm BFMAX_ZPZZ : sve2p1_bf_2op_p_zds_zeroing<int_aarch64_sve_fmax>;
} // HasSVE2orSME2, HasSVEB16B16, UseExperimentalZeroingPseudos

let Predicates = [HasSVE2orSME2, HasSVEB16B16] in {

defm BFMLA_ZPmZZ : sve_fp_3op_p_zds_a_bf<0b00, "bfmla", "BFMLA_ZPZZZ", AArch64fmla_m1>;
defm BFMLS_ZPmZZ : sve_fp_3op_p_zds_a_bf<0b01, "bfmls", "BFMLS_ZPZZZ", AArch64fmls_m1>;

defm BFMLA_ZPZZZ : sve_fp_3op_pred_bf<AArch64fmla_p>;
defm BFMLS_ZPZZZ : sve_fp_3op_pred_bf<AArch64fmls_p>;

defm BFMLA_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmla", 0b10, int_aarch64_sve_fmla_lane>;
defm BFMLS_ZZZI : sve2p1_fp_bfma_by_indexed_elem<"bfmls", 0b11, int_aarch64_sve_fmls_lane>;

defm BFADD_ZPmZZ : sve2p1_bf_2op_p_zds<0b0000, "bfadd", "BFADD_ZPZZ", AArch64fadd_m1, DestructiveBinaryComm>;
defm BFSUB_ZPmZZ : sve2p1_bf_2op_p_zds<0b0001, "bfsub", "BFSUB_ZPZZ", AArch64fsub_m1, DestructiveBinaryComm>;
defm BFMUL_ZPmZZ : sve2p1_bf_2op_p_zds<0b0010, "bfmul", "BFMUL_ZPZZ", AArch64fmul_m1, DestructiveBinaryComm>;

defm BFADD_ZZZ : sve2p1_bf_3op_u_zd<0b000, "bfadd", AArch64fadd>;
defm BFSUB_ZZZ : sve2p1_bf_3op_u_zd<0b001, "bfsub", AArch64fsub>;
defm BFMUL_ZZZ : sve2p1_bf_3op_u_zd<0b010, "bfmul", AArch64fmul>;

defm BFADD_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fadd_p>;
defm BFSUB_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fsub_p>;
defm BFMUL_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fmul_p>;


defm BFMAX_ZPmZZ : sve2p1_bf_2op_p_zds<0b0110, "bfmax", "BFMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>;
defm BFMIN_ZPmZZ : sve2p1_bf_2op_p_zds<0b0111, "bfmin", "BFMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>;

defm BFMAX_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fmax_p>;
defm BFMIN_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fmin_p>;


defm BFMAXNM_ZPmZZ : sve2p1_bf_2op_p_zds<0b0100, "bfmaxnm", "BFMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>;
defm BFMINNM_ZPmZZ : sve2p1_bf_2op_p_zds<0b0101, "bfminnm", "BFMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>;

defm BFMAXNM_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fmaxnm_p>;
defm BFMINNM_ZPZZ : sve2p1_bf_bin_pred_zds<AArch64fminnm_p>;

defm BFMUL_ZZZI : sve2p1_fp_bfmul_by_indexed_elem<"bfmul", int_aarch64_sve_fmul_lane>;

defm BFCLAMP_ZZZ : sve2p1_bfclamp<"bfclamp", AArch64fclamp>;
} // End HasSVE2orSME2, HasSVEB16B16
let Predicates = [HasSVEB16B16] in {
defm BFADD_ZZZ : sve_fp_3op_u_zd_bfloat<0b000, "bfadd", AArch64fadd>;
defm BFSUB_ZZZ : sve_fp_3op_u_zd_bfloat<0b001, "bfsub", AArch64fsub>;
defm BFMUL_ZZZ : sve_fp_3op_u_zd_bfloat<0b010, "bfmul", AArch64fmul>;

defm BFADD_ZPmZZ : sve_fp_2op_p_zds_bfloat<0b0000, "bfadd", "BFADD_ZPZZ", AArch64fadd_m1, DestructiveBinaryComm>;
defm BFSUB_ZPmZZ : sve_fp_2op_p_zds_bfloat<0b0001, "bfsub", "BFSUB_ZPZZ", AArch64fsub_m1, DestructiveBinaryComm>;
defm BFMUL_ZPmZZ : sve_fp_2op_p_zds_bfloat<0b0010, "bfmul", "BFMUL_ZPZZ", AArch64fmul_m1, DestructiveBinaryComm>;
defm BFMAXNM_ZPmZZ : sve_fp_2op_p_zds_bfloat<0b0100, "bfmaxnm", "BFMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>;
defm BFMINNM_ZPmZZ : sve_fp_2op_p_zds_bfloat<0b0101, "bfminnm", "BFMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>;
defm BFMAX_ZPmZZ : sve_fp_2op_p_zds_bfloat<0b0110, "bfmax", "BFMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>;
defm BFMIN_ZPmZZ : sve_fp_2op_p_zds_bfloat<0b0111, "bfmin", "BFMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>;

defm BFADD_ZPZZ : sve_fp_bin_pred_bfloat<AArch64fadd_p>;
defm BFSUB_ZPZZ : sve_fp_bin_pred_bfloat<AArch64fsub_p>;
defm BFMUL_ZPZZ : sve_fp_bin_pred_bfloat<AArch64fmul_p>;
defm BFMAXNM_ZPZZ : sve_fp_bin_pred_bfloat<AArch64fmaxnm_p>;
defm BFMINNM_ZPZZ : sve_fp_bin_pred_bfloat<AArch64fminnm_p>;
defm BFMAX_ZPZZ : sve_fp_bin_pred_bfloat<AArch64fmax_p>;
defm BFMIN_ZPZZ : sve_fp_bin_pred_bfloat<AArch64fmin_p>;

defm BFMLA_ZPmZZ : sve_fp_3op_p_zds_a_bfloat<0b00, "bfmla", "BFMLA_ZPZZZ", AArch64fmla_m1>;
defm BFMLS_ZPmZZ : sve_fp_3op_p_zds_a_bfloat<0b01, "bfmls", "BFMLS_ZPZZZ", AArch64fmls_m1>;

defm BFMLA_ZPZZZ : sve_fp_3op_pred_bfloat<AArch64fmla_p>;
defm BFMLS_ZPZZZ : sve_fp_3op_pred_bfloat<AArch64fmls_p>;

defm BFMLA_ZZZI : sve_fp_fma_by_indexed_elem_bfloat<"bfmla", 0b10, int_aarch64_sve_fmla_lane>;
defm BFMLS_ZZZI : sve_fp_fma_by_indexed_elem_bfloat<"bfmls", 0b11, int_aarch64_sve_fmls_lane>;

defm BFMUL_ZZZI : sve_fp_fmul_by_indexed_elem_bfloat<"bfmul", int_aarch64_sve_fmul_lane>;

defm BFCLAMP_ZZZ : sve_fp_clamp_bfloat<"bfclamp", AArch64fclamp>;
} // End HasSVEB16B16

let Predicates = [HasSVEB16B16, UseExperimentalZeroingPseudos] in {
defm BFADD_ZPZZ : sve_fp_2op_p_zds_zeroing_bfloat<int_aarch64_sve_fadd>;
defm BFSUB_ZPZZ : sve_fp_2op_p_zds_zeroing_bfloat<int_aarch64_sve_fsub>;
defm BFMUL_ZPZZ : sve_fp_2op_p_zds_zeroing_bfloat<int_aarch64_sve_fmul>;
defm BFMAXNM_ZPZZ : sve_fp_2op_p_zds_zeroing_bfloat<int_aarch64_sve_fmaxnm>;
defm BFMINNM_ZPZZ : sve_fp_2op_p_zds_zeroing_bfloat<int_aarch64_sve_fminnm>;
defm BFMIN_ZPZZ : sve_fp_2op_p_zds_zeroing_bfloat<int_aarch64_sve_fmin>;
defm BFMAX_ZPZZ : sve_fp_2op_p_zds_zeroing_bfloat<int_aarch64_sve_fmax>;
} // HasSVEB16B16, UseExperimentalZeroingPseudos


//===----------------------------------------------------------------------===//
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AArch64/AArch64SchedNeoverseN2.td
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def NeoverseN2Model : SchedMachineModel {
let CompleteModel = 1;

list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F,
[HasSVE2p1, HasPAuthLR, HasCPA, HasCSSC]);
[HasSVE2p1, HasSVEB16B16, HasPAuthLR, HasCPA, HasCSSC]);
}

//===----------------------------------------------------------------------===//
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AArch64/AArch64SchedNeoverseV2.td
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ def NeoverseV2Model : SchedMachineModel {
let CompleteModel = 1;

list<Predicate> UnsupportedFeatures = !listconcat(SMEUnsupported.F,
[HasSVE2p1, HasCPA,
HasCSSC]);
[HasSVE2p1, HasSVEB16B16,
HasCPA, HasCSSC]);
}

//===----------------------------------------------------------------------===//
Expand Down
12 changes: 9 additions & 3 deletions llvm/lib/Target/AArch64/AArch64Subtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,10 +188,16 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
(hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
}

/// Returns true if the target has access to either the full range of SVE instructions,
/// or the streaming-compatible subset of SVE instructions.
/// Returns true if the target has access to the streaming-compatible subset
/// of SVE instructions.
bool isStreamingSVEAvailable() const {
return hasSME() && isStreaming();
}

/// Returns true if the target has access to either the full range of SVE
/// instructions, or the streaming-compatible subset of SVE instructions.
bool isSVEorStreamingSVEAvailable() const {
return hasSVE() || (hasSME() && isStreaming());
return hasSVE() || isStreamingSVEAvailable();
}

unsigned getMinVectorRegisterBitWidth() const {
Expand Down
139 changes: 78 additions & 61 deletions llvm/lib/Target/AArch64/SVEInstrFormats.td
Original file line number Diff line number Diff line change
Expand Up @@ -2134,29 +2134,6 @@ class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm,
let mayRaiseFPException = 1;
}

multiclass sve2p1_bf_2op_p_zds<bits<4> opc, string asm, string Ps,
SDPatternOperator op, DestructiveInstTypeEnum flags,
string revname="", bit isReverseInstr=0> {
let DestructiveInstType = flags in {
def NAME : sve_fp_2op_p_zds<0b00, opc, asm, ZPR16>,
SVEPseudo2Instr<Ps, 1>, SVEInstr2Rev<NAME , revname , isReverseInstr>;
}

def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
}

multiclass sve2p1_bf_bin_pred_zds<SDPatternOperator op> {
def _UNDEF : PredTwoOpPseudo<NAME, ZPR16, FalseLanesUndef>;

def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Pseudo>(NAME # _UNDEF)>;
}

multiclass sve2p1_bf_2op_p_zds_zeroing<SDPatternOperator op> {
def _ZERO : PredTwoOpPseudo<NAME, ZPR16, FalseLanesZero>;

def : SVE_3_Op_Pat_SelZero<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Pseudo>(NAME # _ZERO)>;
}

multiclass sve_fp_2op_p_zds<bits<4> opc, string asm, string Ps,
SDPatternOperator op, DestructiveInstTypeEnum flags,
string revname="", bit isReverseInstr=0> {
Expand Down Expand Up @@ -2185,6 +2162,18 @@ multiclass sve_fp_2op_p_zds_fscale<bits<4> opc, string asm,
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}

multiclass sve_fp_2op_p_zds_bfloat<bits<4> opc, string asm, string Ps,
SDPatternOperator op,
DestructiveInstTypeEnum flags,
string revname="", bit isReverseInstr=0> {
let DestructiveInstType = flags in {
def NAME : sve_fp_2op_p_zds<0b00, opc, asm, ZPR16>,
SVEPseudo2Instr<Ps, 1>, SVEInstr2Rev<NAME , revname , isReverseInstr>;
}

def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
}

multiclass sve_fp_2op_p_zds_zeroing_hsd<SDPatternOperator op> {
def _H_ZERO : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;
def _S_ZERO : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;
Expand All @@ -2195,6 +2184,12 @@ multiclass sve_fp_2op_p_zds_zeroing_hsd<SDPatternOperator op> {
def : SVE_3_Op_Pat_SelZero<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _D_ZERO)>;
}

multiclass sve_fp_2op_p_zds_zeroing_bfloat<SDPatternOperator op> {
def _ZERO : PredTwoOpPseudo<NAME, ZPR16, FalseLanesZero>;

def : SVE_3_Op_Pat_SelZero<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Pseudo>(NAME # _ZERO)>;
}

class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, timm32_0_7:$imm3),
asm, "\t$Zdn, $_Zdn, $Zm, $imm3",
Expand Down Expand Up @@ -2300,10 +2295,12 @@ multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op> {
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}

multiclass sve2p1_bf_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op> {
multiclass sve_fp_3op_u_zd_bfloat<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve_fp_3op_u_zd<0b00, opc, asm, ZPR16>;

def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv4bf16, op, nxv4bf16, nxv4bf16, !cast<Instruction>(NAME)>;
def : SVE_2_Op_Pat<nxv2bf16, op, nxv2bf16, nxv2bf16, !cast<Instruction>(NAME)>;
}

multiclass sve_fp_3op_u_zd_ftsmul<bits<3> opc, string asm, SDPatternOperator op> {
Expand Down Expand Up @@ -2364,8 +2361,8 @@ multiclass sve_fp_3op_p_zds_a<bits<2> opc, string asm, string Ps,
def : SVE_4_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}

multiclass sve_fp_3op_p_zds_a_bf<bits<2> opc, string asm, string Ps,
SDPatternOperator op> {
multiclass sve_fp_3op_p_zds_a_bfloat<bits<2> opc, string asm, string Ps,
SDPatternOperator op> {
def NAME : sve_fp_3op_p_zds_a<0b00, opc, asm, ZPR16>,
SVEPseudo2Instr<Ps, 1>, SVEInstr2Rev<NAME, "", 0>;

Expand Down Expand Up @@ -2439,19 +2436,6 @@ class sve_fp_fma_by_indexed_elem<bits<2> sz, bits<2> opc, string asm,
let mayRaiseFPException = 1;
}

multiclass sve2p1_fp_bfma_by_indexed_elem<string asm, bits<2> opc, SDPatternOperator op> {
def NAME : sve_fp_fma_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR3b16,
VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
def : Pat<(nxv8bf16 (op nxv8bf16:$op1, nxv8bf16:$op2, nxv8bf16:$op3, (i32 VectorIndexH32b_timm:$idx))),
(!cast<Instruction>(NAME) $op1, $op2, $op3, VectorIndexH32b_timm:$idx)>;
}

multiclass sve_fp_fma_by_indexed_elem<bits<2> opc, string asm,
SDPatternOperator op> {
def _H : sve_fp_fma_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR3b16, VectorIndexH32b> {
Expand Down Expand Up @@ -2482,6 +2466,19 @@ multiclass sve_fp_fma_by_indexed_elem<bits<2> opc, string asm,
(!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>;
}

multiclass sve_fp_fma_by_indexed_elem_bfloat<string asm, bits<2> opc,
SDPatternOperator op> {
def NAME : sve_fp_fma_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}

def : Pat<(nxv8bf16 (op nxv8bf16:$op1, nxv8bf16:$op2, nxv8bf16:$op3, (i32 VectorIndexH32b_timm:$idx))),
(!cast<Instruction>(NAME) $op1, $op2, $op3, VectorIndexH32b_timm:$idx)>;
}

//===----------------------------------------------------------------------===//
// SVE Floating Point Multiply - Indexed Group
Expand All @@ -2506,18 +2503,6 @@ class sve_fp_fmul_by_indexed_elem<bits<2> sz, bit o2, string asm, ZPRRegOp zprty
let mayRaiseFPException = 1;
}

multiclass sve2p1_fp_bfmul_by_indexed_elem<string asm, SDPatternOperator ir_intrinsic> {
def NAME : sve_fp_fmul_by_indexed_elem<{0, ?}, 0b1, asm, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
def : Pat <(nxv8bf16 (ir_intrinsic nxv8bf16:$Op1, nxv8bf16:$Op2, (i32 VectorIndexH32b_timm:$idx))),
(!cast<Instruction>(NAME) $Op1, $Op2, VectorIndexH32b_timm:$idx)>;
}

multiclass sve_fp_fmul_by_indexed_elem<string asm, SDPatternOperator op> {
def _H : sve_fp_fmul_by_indexed_elem<{0, ?}, 0b0, asm, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
Expand Down Expand Up @@ -2547,6 +2532,19 @@ multiclass sve_fp_fmul_by_indexed_elem<string asm, SDPatternOperator op> {
(!cast<Instruction>(NAME # _D) $Op1, $Op2, VectorIndexD32b_timm:$idx)>;
}

multiclass sve_fp_fmul_by_indexed_elem_bfloat<string asm,
SDPatternOperator op> {
def NAME : sve_fp_fmul_by_indexed_elem<{0, ?}, 0b1, asm, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
def : Pat <(nxv8bf16 (op nxv8bf16:$Op1, nxv8bf16:$Op2, (i32 VectorIndexH32b_timm:$idx))),
(!cast<Instruction>(NAME) $Op1, $Op2, VectorIndexH32b_timm:$idx)>;
}

//===----------------------------------------------------------------------===//
// SVE Floating Point Complex Multiply-Add Group
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -3063,9 +3061,11 @@ multiclass sve2_fp_un_pred_zeroing_hsd<SDPatternOperator op> {
def : SVE_1_Op_PassthruZero_Pat<nxv2i64, op, nxv2i1, nxv2f64, !cast<Pseudo>(NAME # _D_ZERO)>;
}

multiclass sve2_fp_convert_down_odd_rounding<string asm, string op> {
multiclass sve2_fp_convert_down_odd_rounding<string asm, string op, SDPatternOperator ir_op = null_frag> {
def _DtoS : sve_fp_2op_p_zd<0b0001010, asm, ZPR64, ZPR32, ElementSizeD>;

def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
def : SVE_1_Op_Passthru_Pat<nxv2f32, ir_op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
}

//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -8811,9 +8811,13 @@ class sve_bfloat_convert<bit N, string asm>
let mayRaiseFPException = 1;
}

multiclass sve_bfloat_convert<bit N, string asm, SDPatternOperator op> {
multiclass sve_bfloat_convert<bit N, string asm, SDPatternOperator op,
SDPatternOperator ir_op = null_frag> {
def NAME : sve_bfloat_convert<N, asm>;

def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8i1, nxv4f32, !cast<Instruction>(NAME)>;
def : SVE_1_Op_Passthru_Round_Pat<nxv4bf16, ir_op, nxv4i1, nxv4f32, !cast<Instruction>(NAME)>;
def : SVE_1_Op_Passthru_Round_Pat<nxv2bf16, ir_op, nxv2i1, nxv2f32, !cast<Instruction>(NAME)>;
}

//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -9073,6 +9077,15 @@ multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> {
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _D_UNDEF)>;
}

// Predicated pseudo floating point (BFloat) two operand instructions.
multiclass sve_fp_bin_pred_bfloat<SDPatternOperator op> {
def _UNDEF : PredTwoOpPseudo<NAME, ZPR16, FalseLanesUndef>;

def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Pseudo>(NAME # _UNDEF)>;
def : SVE_3_Op_Pat<nxv4bf16, op, nxv4i1, nxv4bf16, nxv4bf16, !cast<Pseudo>(NAME # _UNDEF)>;
def : SVE_3_Op_Pat<nxv2bf16, op, nxv2i1, nxv2bf16, nxv2bf16, !cast<Pseudo>(NAME # _UNDEF)>;
}

// Predicated pseudo floating point three operand instructions.
multiclass sve_fp_3op_pred_hfd<SDPatternOperator op> {
def _H_UNDEF : PredThreeOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
Expand All @@ -9087,10 +9100,13 @@ multiclass sve_fp_3op_pred_hfd<SDPatternOperator op> {
def : SVE_4_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D_UNDEF)>;
}

multiclass sve_fp_3op_pred_bf<SDPatternOperator op> {
// Predicated pseudo floating point (BFloat) three operand instructions.
multiclass sve_fp_3op_pred_bfloat<SDPatternOperator op> {
def _UNDEF : PredThreeOpPseudo<NAME, ZPR16, FalseLanesUndef>;

def : SVE_4_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME # _UNDEF)>;
def : SVE_4_Op_Pat<nxv4bf16, op, nxv4i1, nxv4bf16, nxv4bf16, nxv4bf16, !cast<Instruction>(NAME # _UNDEF)>;
def : SVE_4_Op_Pat<nxv2bf16, op, nxv2i1, nxv2bf16, nxv2bf16, nxv2bf16, !cast<Instruction>(NAME # _UNDEF)>;
}

// Predicated pseudo integer two operand instructions.
Expand Down Expand Up @@ -9147,7 +9163,7 @@ multiclass sve_int_bin_pred_all_active_bhsd<SDPatternOperator op> {
// SME2 or SVE2.1 Instructions
//===----------------------------------------------------------------------===//

class sve2p1_fclamp<string asm, bits<2> sz, ZPRRegOp zpr_ty>
class sve_fp_clamp<string asm, bits<2> sz, ZPRRegOp zpr_ty>
: I<(outs zpr_ty:$Zd), (ins zpr_ty:$_Zd, zpr_ty:$Zn, zpr_ty:$Zm),
asm, "\t$Zd, $Zn, $Zm", "", []>,
Sched<[]> {
Expand All @@ -9168,18 +9184,19 @@ class sve2p1_fclamp<string asm, bits<2> sz, ZPRRegOp zpr_ty>
let hasSideEffects = 0;
}

multiclass sve2p1_fclamp<string asm, SDPatternOperator op> {
def _H : sve2p1_fclamp<asm, 0b01, ZPR16>;
def _S : sve2p1_fclamp<asm, 0b10, ZPR32>;
def _D : sve2p1_fclamp<asm, 0b11, ZPR64>;
multiclass sve_fp_clamp<string asm, SDPatternOperator op> {
def _H : sve_fp_clamp<asm, 0b01, ZPR16>;
def _S : sve_fp_clamp<asm, 0b10, ZPR32>;
def _D : sve_fp_clamp<asm, 0b11, ZPR64>;

def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}

multiclass sve2p1_bfclamp<string asm, SDPatternOperator op> {
def NAME : sve2p1_fclamp<asm, 0b00, ZPR16>;
multiclass sve_fp_clamp_bfloat<string asm, SDPatternOperator op> {
def NAME : sve_fp_clamp<asm, 0b00, ZPR16>;

def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
}

Expand Down
752 changes: 752 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-bf16-arith.ll

Large diffs are not rendered by default.

214 changes: 214 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-bf16-converts.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mattr=+sve < %s | FileCheck %s
; RUN: llc -mattr=+sve --enable-no-nans-fp-math < %s | FileCheck %s --check-prefixes=CHECK,NOBF16NNAN
; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,BF16
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,BF16

target triple = "aarch64-unknown-linux-gnu"

; NOTE: "fptrunc <# x double> to <# x bfloat>" is not supported because SVE
; lacks a down convert that rounds to odd. Such IR will trigger the usual
; failure (crash) when attempting to unroll a scalable vector.

define <vscale x 2 x float> @fpext_nxv2bf16_to_nxv2f32(<vscale x 2 x bfloat> %a) {
; CHECK-LABEL: fpext_nxv2bf16_to_nxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ret
%res = fpext <vscale x 2 x bfloat> %a to <vscale x 2 x float>
ret <vscale x 2 x float> %res
}

define <vscale x 4 x float> @fpext_nxv4bf16_to_nxv4f32(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: fpext_nxv4bf16_to_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ret
%res = fpext <vscale x 4 x bfloat> %a to <vscale x 4 x float>
ret <vscale x 4 x float> %res
}

define <vscale x 8 x float> @fpext_nxv8bf16_to_nxv8f32(<vscale x 8 x bfloat> %a) {
; CHECK-LABEL: fpext_nxv8bf16_to_nxv8f32:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpklo z1.s, z0.h
; CHECK-NEXT: uunpkhi z2.s, z0.h
; CHECK-NEXT: lsl z0.s, z1.s, #16
; CHECK-NEXT: lsl z1.s, z2.s, #16
; CHECK-NEXT: ret
%res = fpext <vscale x 8 x bfloat> %a to <vscale x 8 x float>
ret <vscale x 8 x float> %res
}

define <vscale x 2 x double> @fpext_nxv2bf16_to_nxv2f64(<vscale x 2 x bfloat> %a) {
; CHECK-LABEL: fpext_nxv2bf16_to_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: fcvt z0.d, p0/m, z0.s
; CHECK-NEXT: ret
%res = fpext <vscale x 2 x bfloat> %a to <vscale x 2 x double>
ret <vscale x 2 x double> %res
}

define <vscale x 4 x double> @fpext_nxv4bf16_to_nxv4f64(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: fpext_nxv4bf16_to_nxv4f64:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpklo z1.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: lsl z1.s, z1.s, #16
; CHECK-NEXT: lsl z2.s, z0.s, #16
; CHECK-NEXT: movprfx z0, z1
; CHECK-NEXT: fcvt z0.d, p0/m, z1.s
; CHECK-NEXT: movprfx z1, z2
; CHECK-NEXT: fcvt z1.d, p0/m, z2.s
; CHECK-NEXT: ret
%res = fpext <vscale x 4 x bfloat> %a to <vscale x 4 x double>
ret <vscale x 4 x double> %res
}

define <vscale x 8 x double> @fpext_nxv8bf16_to_nxv8f64(<vscale x 8 x bfloat> %a) {
; CHECK-LABEL: fpext_nxv8bf16_to_nxv8f64:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpklo z1.s, z0.h
; CHECK-NEXT: uunpkhi z0.s, z0.h
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: uunpklo z2.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: lsl z1.s, z1.s, #16
; CHECK-NEXT: lsl z2.s, z2.s, #16
; CHECK-NEXT: lsl z3.s, z3.s, #16
; CHECK-NEXT: lsl z4.s, z0.s, #16
; CHECK-NEXT: fcvt z1.d, p0/m, z1.s
; CHECK-NEXT: movprfx z0, z2
; CHECK-NEXT: fcvt z0.d, p0/m, z2.s
; CHECK-NEXT: movprfx z2, z3
; CHECK-NEXT: fcvt z2.d, p0/m, z3.s
; CHECK-NEXT: movprfx z3, z4
; CHECK-NEXT: fcvt z3.d, p0/m, z4.s
; CHECK-NEXT: ret
%res = fpext <vscale x 8 x bfloat> %a to <vscale x 8 x double>
ret <vscale x 8 x double> %res
}

define <vscale x 2 x bfloat> @fptrunc_nxv2f32_to_nxv2bf16(<vscale x 2 x float> %a) {
; NOBF16-LABEL: fptrunc_nxv2f32_to_nxv2bf16:
; NOBF16: // %bb.0:
; NOBF16-NEXT: mov z1.s, #32767 // =0x7fff
; NOBF16-NEXT: lsr z2.s, z0.s, #16
; NOBF16-NEXT: ptrue p0.d
; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
; NOBF16-NEXT: and z2.s, z2.s, #0x1
; NOBF16-NEXT: add z1.s, z0.s, z1.s
; NOBF16-NEXT: orr z0.s, z0.s, #0x400000
; NOBF16-NEXT: add z1.s, z2.s, z1.s
; NOBF16-NEXT: sel z0.s, p0, z0.s, z1.s
; NOBF16-NEXT: lsr z0.s, z0.s, #16
; NOBF16-NEXT: ret
;
; NOBF16NNAN-LABEL: fptrunc_nxv2f32_to_nxv2bf16:
; NOBF16NNAN: // %bb.0:
; NOBF16NNAN-NEXT: mov z1.s, #32767 // =0x7fff
; NOBF16NNAN-NEXT: lsr z2.s, z0.s, #16
; NOBF16NNAN-NEXT: and z2.s, z2.s, #0x1
; NOBF16NNAN-NEXT: add z0.s, z0.s, z1.s
; NOBF16NNAN-NEXT: add z0.s, z2.s, z0.s
; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16
; NOBF16NNAN-NEXT: ret
;
; BF16-LABEL: fptrunc_nxv2f32_to_nxv2bf16:
; BF16: // %bb.0:
; BF16-NEXT: ptrue p0.d
; BF16-NEXT: bfcvt z0.h, p0/m, z0.s
; BF16-NEXT: ret
%res = fptrunc <vscale x 2 x float> %a to <vscale x 2 x bfloat>
ret <vscale x 2 x bfloat> %res
}

define <vscale x 4 x bfloat> @fptrunc_nxv4f32_to_nxv4bf16(<vscale x 4 x float> %a) {
; NOBF16-LABEL: fptrunc_nxv4f32_to_nxv4bf16:
; NOBF16: // %bb.0:
; NOBF16-NEXT: mov z1.s, #32767 // =0x7fff
; NOBF16-NEXT: lsr z2.s, z0.s, #16
; NOBF16-NEXT: ptrue p0.s
; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
; NOBF16-NEXT: and z2.s, z2.s, #0x1
; NOBF16-NEXT: add z1.s, z0.s, z1.s
; NOBF16-NEXT: orr z0.s, z0.s, #0x400000
; NOBF16-NEXT: add z1.s, z2.s, z1.s
; NOBF16-NEXT: sel z0.s, p0, z0.s, z1.s
; NOBF16-NEXT: lsr z0.s, z0.s, #16
; NOBF16-NEXT: ret
;
; NOBF16NNAN-LABEL: fptrunc_nxv4f32_to_nxv4bf16:
; NOBF16NNAN: // %bb.0:
; NOBF16NNAN-NEXT: mov z1.s, #32767 // =0x7fff
; NOBF16NNAN-NEXT: lsr z2.s, z0.s, #16
; NOBF16NNAN-NEXT: and z2.s, z2.s, #0x1
; NOBF16NNAN-NEXT: add z0.s, z0.s, z1.s
; NOBF16NNAN-NEXT: add z0.s, z2.s, z0.s
; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16
; NOBF16NNAN-NEXT: ret
;
; BF16-LABEL: fptrunc_nxv4f32_to_nxv4bf16:
; BF16: // %bb.0:
; BF16-NEXT: ptrue p0.s
; BF16-NEXT: bfcvt z0.h, p0/m, z0.s
; BF16-NEXT: ret
%res = fptrunc <vscale x 4 x float> %a to <vscale x 4 x bfloat>
ret <vscale x 4 x bfloat> %res
}

define <vscale x 8 x bfloat> @fptrunc_nxv8f32_to_nxv8bf16(<vscale x 8 x float> %a) {
; NOBF16-LABEL: fptrunc_nxv8f32_to_nxv8bf16:
; NOBF16: // %bb.0:
; NOBF16-NEXT: mov z2.s, #32767 // =0x7fff
; NOBF16-NEXT: lsr z3.s, z1.s, #16
; NOBF16-NEXT: lsr z4.s, z0.s, #16
; NOBF16-NEXT: ptrue p0.s
; NOBF16-NEXT: and z3.s, z3.s, #0x1
; NOBF16-NEXT: and z4.s, z4.s, #0x1
; NOBF16-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s
; NOBF16-NEXT: add z5.s, z1.s, z2.s
; NOBF16-NEXT: add z2.s, z0.s, z2.s
; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s
; NOBF16-NEXT: orr z1.s, z1.s, #0x400000
; NOBF16-NEXT: orr z0.s, z0.s, #0x400000
; NOBF16-NEXT: add z3.s, z3.s, z5.s
; NOBF16-NEXT: add z2.s, z4.s, z2.s
; NOBF16-NEXT: sel z1.s, p1, z1.s, z3.s
; NOBF16-NEXT: sel z0.s, p0, z0.s, z2.s
; NOBF16-NEXT: lsr z1.s, z1.s, #16
; NOBF16-NEXT: lsr z0.s, z0.s, #16
; NOBF16-NEXT: uzp1 z0.h, z0.h, z1.h
; NOBF16-NEXT: ret
;
; NOBF16NNAN-LABEL: fptrunc_nxv8f32_to_nxv8bf16:
; NOBF16NNAN: // %bb.0:
; NOBF16NNAN-NEXT: mov z2.s, #32767 // =0x7fff
; NOBF16NNAN-NEXT: lsr z3.s, z1.s, #16
; NOBF16NNAN-NEXT: lsr z4.s, z0.s, #16
; NOBF16NNAN-NEXT: and z3.s, z3.s, #0x1
; NOBF16NNAN-NEXT: and z4.s, z4.s, #0x1
; NOBF16NNAN-NEXT: add z1.s, z1.s, z2.s
; NOBF16NNAN-NEXT: add z0.s, z0.s, z2.s
; NOBF16NNAN-NEXT: add z1.s, z3.s, z1.s
; NOBF16NNAN-NEXT: add z0.s, z4.s, z0.s
; NOBF16NNAN-NEXT: lsr z1.s, z1.s, #16
; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16
; NOBF16NNAN-NEXT: uzp1 z0.h, z0.h, z1.h
; NOBF16NNAN-NEXT: ret
;
; BF16-LABEL: fptrunc_nxv8f32_to_nxv8bf16:
; BF16: // %bb.0:
; BF16-NEXT: ptrue p0.s
; BF16-NEXT: bfcvt z1.h, p0/m, z1.s
; BF16-NEXT: bfcvt z0.h, p0/m, z0.s
; BF16-NEXT: uzp1 z0.h, z0.h, z1.h
; BF16-NEXT: ret
%res = fptrunc <vscale x 8 x float> %a to <vscale x 8 x bfloat>
ret <vscale x 8 x bfloat> %res
}
355 changes: 355 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-bf16-rounding.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,355 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub
; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

;
; FCEIL
;

; llvm.ceil on nxv2bf16: widened to f32 by lsl #16 (the fpextend lowering
; from the .td patterns), rounded with frintp, narrowed back with bfcvt.
; nxv2 elements live in 64-bit containers, hence the ptrue p0.d predicate.
define <vscale x 2 x bfloat> @frintp_nxv2bf16(<vscale x 2 x bfloat> %a) {
; CHECK-LABEL: frintp_nxv2bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: frintp z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 2 x bfloat> @llvm.ceil.nxv2bf16(<vscale x 2 x bfloat> %a)
ret <vscale x 2 x bfloat> %res
}

; llvm.ceil on nxv4bf16: extend to f32 via lsl #16, frintp, truncate via bfcvt.
define <vscale x 4 x bfloat> @frintp_nxv4bf16(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: frintp_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: frintp z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x bfloat> @llvm.ceil.nxv4bf16(<vscale x 4 x bfloat> %a)
ret <vscale x 4 x bfloat> %res
}

; llvm.ceil on nxv8bf16: split into two nxv4 halves (uunpkhi/uunpklo),
; widen each to f32 (lsl #16), frintp, bfcvt back, then re-pack with uzp1.
define <vscale x 8 x bfloat> @frintp_nxv8bf16(<vscale x 8 x bfloat> %a) {
; CHECK-LABEL: frintp_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpkhi z1.s, z0.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: lsl z1.s, z1.s, #16
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: frintp z1.s, p0/m, z1.s
; CHECK-NEXT: frintp z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x bfloat> @llvm.ceil.nxv8bf16(<vscale x 8 x bfloat> %a)
ret <vscale x 8 x bfloat> %res
}

;
; FFLOOR
;

; llvm.floor on nxv2bf16: widen to f32 (lsl #16), frintm, narrow via bfcvt.
define <vscale x 2 x bfloat> @frintm_nxv2bf16(<vscale x 2 x bfloat> %a) {
; CHECK-LABEL: frintm_nxv2bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: frintm z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 2 x bfloat> @llvm.floor.nxv2bf16(<vscale x 2 x bfloat> %a)
ret <vscale x 2 x bfloat> %res
}

; llvm.floor on nxv4bf16: widen to f32 (lsl #16), frintm, narrow via bfcvt.
define <vscale x 4 x bfloat> @frintm_nxv4bf16(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: frintm_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: frintm z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x bfloat> @llvm.floor.nxv4bf16(<vscale x 4 x bfloat> %a)
ret <vscale x 4 x bfloat> %res
}

; llvm.floor on nxv8bf16: unpack into two nxv4 halves, widen each (lsl #16),
; frintm, bfcvt back, re-pack with uzp1.
define <vscale x 8 x bfloat> @frintm_nxv8bf16(<vscale x 8 x bfloat> %a) {
; CHECK-LABEL: frintm_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpkhi z1.s, z0.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: lsl z1.s, z1.s, #16
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: frintm z1.s, p0/m, z1.s
; CHECK-NEXT: frintm z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x bfloat> @llvm.floor.nxv8bf16(<vscale x 8 x bfloat> %a)
ret <vscale x 8 x bfloat> %res
}

;
; FNEARBYINT
;

; llvm.nearbyint on nxv2bf16: widen to f32 (lsl #16), frinti, narrow via bfcvt.
define <vscale x 2 x bfloat> @frinti_nxv2bf16(<vscale x 2 x bfloat> %a) {
; CHECK-LABEL: frinti_nxv2bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: frinti z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 2 x bfloat> @llvm.nearbyint.nxv2bf16(<vscale x 2 x bfloat> %a)
ret <vscale x 2 x bfloat> %res
}

; llvm.nearbyint on nxv4bf16: widen to f32 (lsl #16), frinti, narrow via bfcvt.
define <vscale x 4 x bfloat> @frinti_nxv4bf16(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: frinti_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: frinti z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x bfloat> @llvm.nearbyint.nxv4bf16(<vscale x 4 x bfloat> %a)
ret <vscale x 4 x bfloat> %res
}

; llvm.nearbyint on nxv8bf16: unpack into two nxv4 halves, widen each
; (lsl #16), frinti, bfcvt back, re-pack with uzp1.
define <vscale x 8 x bfloat> @frinti_nxv8bf16(<vscale x 8 x bfloat> %a) {
; CHECK-LABEL: frinti_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpkhi z1.s, z0.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: lsl z1.s, z1.s, #16
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: frinti z1.s, p0/m, z1.s
; CHECK-NEXT: frinti z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x bfloat> @llvm.nearbyint.nxv8bf16(<vscale x 8 x bfloat> %a)
ret <vscale x 8 x bfloat> %res
}

;
; FRINT
;

; llvm.rint on nxv2bf16: widen to f32 (lsl #16), frintx, narrow via bfcvt.
define <vscale x 2 x bfloat> @frintx_nxv2bf16(<vscale x 2 x bfloat> %a) {
; CHECK-LABEL: frintx_nxv2bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: frintx z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 2 x bfloat> @llvm.rint.nxv2bf16(<vscale x 2 x bfloat> %a)
ret <vscale x 2 x bfloat> %res
}

; llvm.rint on nxv4bf16: widen to f32 (lsl #16), frintx, narrow via bfcvt.
define <vscale x 4 x bfloat> @frintx_nxv4bf16(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: frintx_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: frintx z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x bfloat> @llvm.rint.nxv4bf16(<vscale x 4 x bfloat> %a)
ret <vscale x 4 x bfloat> %res
}

; llvm.rint on nxv8bf16: unpack into two nxv4 halves, widen each (lsl #16),
; frintx, bfcvt back, re-pack with uzp1.
define <vscale x 8 x bfloat> @frintx_nxv8bf16(<vscale x 8 x bfloat> %a) {
; CHECK-LABEL: frintx_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpkhi z1.s, z0.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: lsl z1.s, z1.s, #16
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: frintx z1.s, p0/m, z1.s
; CHECK-NEXT: frintx z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x bfloat> @llvm.rint.nxv8bf16(<vscale x 8 x bfloat> %a)
ret <vscale x 8 x bfloat> %res
}

;
; ROUND
;

; llvm.round on nxv2bf16: widen to f32 (lsl #16), frinta, narrow via bfcvt.
define <vscale x 2 x bfloat> @frinta_nxv2bf16(<vscale x 2 x bfloat> %a) {
; CHECK-LABEL: frinta_nxv2bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: frinta z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 2 x bfloat> @llvm.round.nxv2bf16(<vscale x 2 x bfloat> %a)
ret <vscale x 2 x bfloat> %res
}

; llvm.round on nxv4bf16: widen to f32 (lsl #16), frinta, narrow via bfcvt.
define <vscale x 4 x bfloat> @frinta_nxv4bf16(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: frinta_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: frinta z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x bfloat> @llvm.round.nxv4bf16(<vscale x 4 x bfloat> %a)
ret <vscale x 4 x bfloat> %res
}

; llvm.round on nxv8bf16: unpack into two nxv4 halves, widen each (lsl #16),
; frinta, bfcvt back, re-pack with uzp1.
define <vscale x 8 x bfloat> @frinta_nxv8bf16(<vscale x 8 x bfloat> %a) {
; CHECK-LABEL: frinta_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpkhi z1.s, z0.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: lsl z1.s, z1.s, #16
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: frinta z1.s, p0/m, z1.s
; CHECK-NEXT: frinta z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x bfloat> @llvm.round.nxv8bf16(<vscale x 8 x bfloat> %a)
ret <vscale x 8 x bfloat> %res
}

;
; ROUNDEVEN
;

; llvm.roundeven on nxv2bf16: widen to f32 (lsl #16), frintn, narrow via bfcvt.
define <vscale x 2 x bfloat> @frintn_nxv2bf16(<vscale x 2 x bfloat> %a) {
; CHECK-LABEL: frintn_nxv2bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: frintn z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 2 x bfloat> @llvm.roundeven.nxv2bf16(<vscale x 2 x bfloat> %a)
ret <vscale x 2 x bfloat> %res
}

; llvm.roundeven on nxv4bf16: widen to f32 (lsl #16), frintn, narrow via bfcvt.
define <vscale x 4 x bfloat> @frintn_nxv4bf16(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: frintn_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: frintn z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x bfloat> @llvm.roundeven.nxv4bf16(<vscale x 4 x bfloat> %a)
ret <vscale x 4 x bfloat> %res
}

; llvm.roundeven on nxv8bf16: unpack into two nxv4 halves, widen each
; (lsl #16), frintn, bfcvt back, re-pack with uzp1.
define <vscale x 8 x bfloat> @frintn_nxv8bf16(<vscale x 8 x bfloat> %a) {
; CHECK-LABEL: frintn_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpkhi z1.s, z0.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: lsl z1.s, z1.s, #16
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: frintn z1.s, p0/m, z1.s
; CHECK-NEXT: frintn z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x bfloat> @llvm.roundeven.nxv8bf16(<vscale x 8 x bfloat> %a)
ret <vscale x 8 x bfloat> %res
}

;
; FTRUNC
;

; llvm.trunc on nxv2bf16: widen to f32 (lsl #16), frintz, narrow via bfcvt.
define <vscale x 2 x bfloat> @frintz_nxv2bf16(<vscale x 2 x bfloat> %a) {
; CHECK-LABEL: frintz_nxv2bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: frintz z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 2 x bfloat> @llvm.trunc.nxv2bf16(<vscale x 2 x bfloat> %a)
ret <vscale x 2 x bfloat> %res
}

; llvm.trunc on nxv4bf16: widen to f32 (lsl #16), frintz, narrow via bfcvt.
define <vscale x 4 x bfloat> @frintz_nxv4bf16(<vscale x 4 x bfloat> %a) {
; CHECK-LABEL: frintz_nxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: frintz z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x bfloat> @llvm.trunc.nxv4bf16(<vscale x 4 x bfloat> %a)
ret <vscale x 4 x bfloat> %res
}

; llvm.trunc on nxv8bf16: unpack into two nxv4 halves, widen each (lsl #16),
; frintz, bfcvt back, re-pack with uzp1.
define <vscale x 8 x bfloat> @frintz_nxv8bf16(<vscale x 8 x bfloat> %a) {
; CHECK-LABEL: frintz_nxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: uunpkhi z1.s, z0.h
; CHECK-NEXT: uunpklo z0.s, z0.h
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: lsl z1.s, z1.s, #16
; CHECK-NEXT: lsl z0.s, z0.s, #16
; CHECK-NEXT: frintz z1.s, p0/m, z1.s
; CHECK-NEXT: frintz z0.s, p0/m, z0.s
; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s
; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s
; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x bfloat> @llvm.trunc.nxv8bf16(<vscale x 8 x bfloat> %a)
ret <vscale x 8 x bfloat> %res
}

declare <vscale x 2 x bfloat> @llvm.ceil.nxv2bf16( <vscale x 2 x bfloat>)
declare <vscale x 4 x bfloat> @llvm.ceil.nxv4bf16( <vscale x 4 x bfloat>)
declare <vscale x 8 x bfloat> @llvm.ceil.nxv8bf16( <vscale x 8 x bfloat>)

declare <vscale x 2 x bfloat> @llvm.floor.nxv2bf16( <vscale x 2 x bfloat>)
declare <vscale x 4 x bfloat> @llvm.floor.nxv4bf16( <vscale x 4 x bfloat>)
declare <vscale x 8 x bfloat> @llvm.floor.nxv8bf16( <vscale x 8 x bfloat>)

declare <vscale x 2 x bfloat> @llvm.nearbyint.nxv2bf16( <vscale x 2 x bfloat>)
declare <vscale x 4 x bfloat> @llvm.nearbyint.nxv4bf16( <vscale x 4 x bfloat>)
declare <vscale x 8 x bfloat> @llvm.nearbyint.nxv8bf16( <vscale x 8 x bfloat>)

declare <vscale x 2 x bfloat> @llvm.rint.nxv2bf16( <vscale x 2 x bfloat>)
declare <vscale x 4 x bfloat> @llvm.rint.nxv4bf16( <vscale x 4 x bfloat>)
declare <vscale x 8 x bfloat> @llvm.rint.nxv8bf16( <vscale x 8 x bfloat>)

declare <vscale x 2 x bfloat> @llvm.round.nxv2bf16( <vscale x 2 x bfloat>)
declare <vscale x 4 x bfloat> @llvm.round.nxv4bf16( <vscale x 4 x bfloat>)
declare <vscale x 8 x bfloat> @llvm.round.nxv8bf16( <vscale x 8 x bfloat>)

declare <vscale x 2 x bfloat> @llvm.roundeven.nxv2bf16( <vscale x 2 x bfloat>)
declare <vscale x 4 x bfloat> @llvm.roundeven.nxv4bf16( <vscale x 4 x bfloat>)
declare <vscale x 8 x bfloat> @llvm.roundeven.nxv8bf16( <vscale x 8 x bfloat>)

declare <vscale x 2 x bfloat> @llvm.trunc.nxv2bf16( <vscale x 2 x bfloat>)
declare <vscale x 4 x bfloat> @llvm.trunc.nxv4bf16( <vscale x 4 x bfloat>)
declare <vscale x 8 x bfloat> @llvm.trunc.nxv8bf16( <vscale x 8 x bfloat>)
115 changes: 115 additions & 0 deletions llvm/test/CodeGen/AArch64/sve2-bf16-converts.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mattr=+sve2 < %s | FileCheck %s --check-prefixes=NOBF16
; RUN: llc -mattr=+sve2,+bf16 < %s | FileCheck %s --check-prefixes=BF16
; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=BF16

target triple = "aarch64-unknown-linux-gnu"

; fptrunc nxv2f64 -> nxv2bf16: first narrow f64 -> f32 with fcvtx
; (round-to-odd, so the second rounding step cannot double-round), then
; f32 -> bf16 either via bfcvt (+bf16) or the open-coded
; round-to-nearest-even add/shift sequence (no NaN quieting path here —
; note this differs from the f32 tests; presumably the RUN lines, not
; visible in this chunk, use no-NaN or differently-configured runs).
define <vscale x 2 x bfloat> @fptrunc_nxv2f64_to_nxv2bf16(<vscale x 2 x double> %a) {
; NOBF16-LABEL: fptrunc_nxv2f64_to_nxv2bf16:
; NOBF16: // %bb.0:
; NOBF16-NEXT: ptrue p0.d
; NOBF16-NEXT: mov z1.s, #32767 // =0x7fff
; NOBF16-NEXT: fcvtx z0.s, p0/m, z0.d
; NOBF16-NEXT: lsr z2.s, z0.s, #16
; NOBF16-NEXT: add z0.s, z0.s, z1.s
; NOBF16-NEXT: and z2.s, z2.s, #0x1
; NOBF16-NEXT: add z0.s, z2.s, z0.s
; NOBF16-NEXT: lsr z0.s, z0.s, #16
; NOBF16-NEXT: ret
;
; BF16-LABEL: fptrunc_nxv2f64_to_nxv2bf16:
; BF16: // %bb.0:
; BF16-NEXT: ptrue p0.d
; BF16-NEXT: fcvtx z0.s, p0/m, z0.d
; BF16-NEXT: bfcvt z0.h, p0/m, z0.s
; BF16-NEXT: ret
%res = fptrunc <vscale x 2 x double> %a to <vscale x 2 x bfloat>
ret <vscale x 2 x bfloat> %res
}

; fptrunc nxv4f64 -> nxv4bf16: each nxv2f64 half goes through fcvtx
; (f64 -> f32 round-to-odd) then f32 -> bf16 (bfcvt with +bf16, otherwise
; the open-coded round-to-nearest-even sequence); halves re-pack with uzp1.
define <vscale x 4 x bfloat> @fptrunc_nxv4f64_to_nxv4bf16(<vscale x 4 x double> %a) {
; NOBF16-LABEL: fptrunc_nxv4f64_to_nxv4bf16:
; NOBF16: // %bb.0:
; NOBF16-NEXT: ptrue p0.d
; NOBF16-NEXT: mov z2.s, #32767 // =0x7fff
; NOBF16-NEXT: fcvtx z1.s, p0/m, z1.d
; NOBF16-NEXT: fcvtx z0.s, p0/m, z0.d
; NOBF16-NEXT: lsr z3.s, z1.s, #16
; NOBF16-NEXT: lsr z4.s, z0.s, #16
; NOBF16-NEXT: add z1.s, z1.s, z2.s
; NOBF16-NEXT: add z0.s, z0.s, z2.s
; NOBF16-NEXT: and z3.s, z3.s, #0x1
; NOBF16-NEXT: and z4.s, z4.s, #0x1
; NOBF16-NEXT: add z1.s, z3.s, z1.s
; NOBF16-NEXT: add z0.s, z4.s, z0.s
; NOBF16-NEXT: lsr z1.s, z1.s, #16
; NOBF16-NEXT: lsr z0.s, z0.s, #16
; NOBF16-NEXT: uzp1 z0.s, z0.s, z1.s
; NOBF16-NEXT: ret
;
; BF16-LABEL: fptrunc_nxv4f64_to_nxv4bf16:
; BF16: // %bb.0:
; BF16-NEXT: ptrue p0.d
; BF16-NEXT: fcvtx z1.s, p0/m, z1.d
; BF16-NEXT: fcvtx z0.s, p0/m, z0.d
; BF16-NEXT: bfcvt z1.h, p0/m, z1.s
; BF16-NEXT: bfcvt z0.h, p0/m, z0.s
; BF16-NEXT: uzp1 z0.s, z0.s, z1.s
; BF16-NEXT: ret
%res = fptrunc <vscale x 4 x double> %a to <vscale x 4 x bfloat>
ret <vscale x 4 x bfloat> %res
}

; fptrunc nxv8f64 -> nxv8bf16: four nxv2f64 quarters are each narrowed with
; fcvtx (f64 -> f32 round-to-odd) and then to bf16 (bfcvt with +bf16,
; otherwise the open-coded round-to-nearest-even sequence); results are
; merged with a tree of uzp1 at .s then .h granularity.
define <vscale x 8 x bfloat> @fptrunc_nxv8f64_to_nxv8bf16(<vscale x 8 x double> %a) {
; NOBF16-LABEL: fptrunc_nxv8f64_to_nxv8bf16:
; NOBF16: // %bb.0:
; NOBF16-NEXT: ptrue p0.d
; NOBF16-NEXT: mov z4.s, #32767 // =0x7fff
; NOBF16-NEXT: fcvtx z3.s, p0/m, z3.d
; NOBF16-NEXT: fcvtx z2.s, p0/m, z2.d
; NOBF16-NEXT: fcvtx z1.s, p0/m, z1.d
; NOBF16-NEXT: fcvtx z0.s, p0/m, z0.d
; NOBF16-NEXT: lsr z5.s, z3.s, #16
; NOBF16-NEXT: lsr z6.s, z2.s, #16
; NOBF16-NEXT: lsr z7.s, z1.s, #16
; NOBF16-NEXT: lsr z24.s, z0.s, #16
; NOBF16-NEXT: add z3.s, z3.s, z4.s
; NOBF16-NEXT: add z2.s, z2.s, z4.s
; NOBF16-NEXT: add z1.s, z1.s, z4.s
; NOBF16-NEXT: add z0.s, z0.s, z4.s
; NOBF16-NEXT: and z5.s, z5.s, #0x1
; NOBF16-NEXT: and z6.s, z6.s, #0x1
; NOBF16-NEXT: and z7.s, z7.s, #0x1
; NOBF16-NEXT: and z24.s, z24.s, #0x1
; NOBF16-NEXT: add z3.s, z5.s, z3.s
; NOBF16-NEXT: add z2.s, z6.s, z2.s
; NOBF16-NEXT: add z1.s, z7.s, z1.s
; NOBF16-NEXT: add z0.s, z24.s, z0.s
; NOBF16-NEXT: lsr z3.s, z3.s, #16
; NOBF16-NEXT: lsr z2.s, z2.s, #16
; NOBF16-NEXT: lsr z1.s, z1.s, #16
; NOBF16-NEXT: lsr z0.s, z0.s, #16
; NOBF16-NEXT: uzp1 z2.s, z2.s, z3.s
; NOBF16-NEXT: uzp1 z0.s, z0.s, z1.s
; NOBF16-NEXT: uzp1 z0.h, z0.h, z2.h
; NOBF16-NEXT: ret
;
; BF16-LABEL: fptrunc_nxv8f64_to_nxv8bf16:
; BF16: // %bb.0:
; BF16-NEXT: ptrue p0.d
; BF16-NEXT: fcvtx z3.s, p0/m, z3.d
; BF16-NEXT: fcvtx z2.s, p0/m, z2.d
; BF16-NEXT: fcvtx z1.s, p0/m, z1.d
; BF16-NEXT: fcvtx z0.s, p0/m, z0.d
; BF16-NEXT: bfcvt z3.h, p0/m, z3.s
; BF16-NEXT: bfcvt z2.h, p0/m, z2.s
; BF16-NEXT: bfcvt z1.h, p0/m, z1.s
; BF16-NEXT: bfcvt z0.h, p0/m, z0.s
; BF16-NEXT: uzp1 z2.s, z2.s, z3.s
; BF16-NEXT: uzp1 z0.s, z0.s, z1.s
; BF16-NEXT: uzp1 z0.h, z0.h, z2.h
; BF16-NEXT: ret
%res = fptrunc <vscale x 8 x double> %a to <vscale x 8 x bfloat>
ret <vscale x 8 x bfloat> %res
}