98 changes: 89 additions & 9 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,14 +151,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->useRealTrue16Insts()) {
addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
} else {
addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
}

// Unless there are also VOP3P operations, no operations are really legal.
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
Expand Down Expand Up @@ -196,6 +199,41 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::i1, MVT::v32i32},
Custom);

if (isTypeLegal(MVT::bf16)) {
for (unsigned Opc :
{ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
ISD::FREM, ISD::FMA, ISD::FMINNUM, ISD::FMAXNUM,
ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FSQRT, ISD::FCBRT,
ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FPOWI,
ISD::FLDEXP, ISD::FFREXP, ISD::FLOG, ISD::FLOG2,
ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10,
ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FNEARBYINT,
ISD::FROUND, ISD::FROUNDEVEN, ISD::FFLOOR, ISD::FCANONICALIZE,
ISD::SETCC}) {
// FIXME: The promoted-to type shouldn't need to be explicit
setOperationAction(Opc, MVT::bf16, Promote);
AddPromotedToType(Opc, MVT::bf16, MVT::f32);
}

setOperationAction(ISD::FP_ROUND, MVT::bf16, Expand);

setOperationAction(ISD::SELECT, MVT::bf16, Promote);
AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);

// TODO: Could make these legal
setOperationAction(ISD::FABS, MVT::bf16, Expand);
setOperationAction(ISD::FNEG, MVT::bf16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);

// We only need to custom lower because we can't specify an action for bf16
// sources.
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Promote);
AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16);
}

setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
Expand Down Expand Up @@ -388,8 +426,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// Avoid stack access for these.
// TODO: Generalize to more vector types.
setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
{MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
MVT::v4i16, MVT::v4f16},
{MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
MVT::v8i8, MVT::v4i16, MVT::v4f16},
Custom);

// Deal with vec3 vector operations when widened to vec4.
Expand Down Expand Up @@ -498,6 +536,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);

// Custom lower these because we can't specify a rule based on an illegal
// source bf16.
setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f32, Custom);
setOperationAction({ISD::FP_EXTEND, ISD::STRICT_FP_EXTEND}, MVT::f64, Custom);

if (Subtarget->has16BitInsts()) {
setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
Expand All @@ -524,26 +567,38 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);

setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);

setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom);

// F16 - Constant Actions.
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

// F16 - Load/Store Actions.
setOperationAction(ISD::LOAD, MVT::f16, Promote);
AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
setOperationAction(ISD::STORE, MVT::f16, Promote);
AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);

// BF16 - Load/Store Actions.
setOperationAction(ISD::LOAD, MVT::bf16, Promote);
AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
setOperationAction(ISD::STORE, MVT::bf16, Promote);
AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);

// F16 - VOP1 Actions.
setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
MVT::f16, Custom);

setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);

// F16 - VOP2 Actions.
setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand);
setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
Expand);
setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
setOperationAction(ISD::FFREXP, MVT::f16, Custom);
setOperationAction(ISD::FDIV, MVT::f16, Custom);
Expand All @@ -554,8 +609,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMAD, MVT::f16, Legal);

for (MVT VT :
{MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16, MVT::v32f16}) {
{MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
Expand Down Expand Up @@ -587,7 +643,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// XXX - Do these do anything? Vector constants turn into build_vector.
setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);

setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16}, Legal);
setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
Legal);

setOperationAction(ISD::STORE, MVT::v2i16, Promote);
AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
Expand Down Expand Up @@ -699,7 +756,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
MVT::v2f16, Legal);

setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16},
setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
Custom);

setOperationAction(ISD::VECTOR_SHUFFLE,
Expand Down Expand Up @@ -3902,6 +3959,26 @@ SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
return Op;
}

// DAG legality rules are keyed on the result type only, so an extend whose
// source is bf16 cannot be given its own action. Such nodes are routed here
// and rewritten as BF16_TO_FP applied to the integer bit pattern of the
// source; extends from any other source type are returned unchanged.
SDValue SITargetLowering::lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
  const bool Strict = ISD::STRICT_FP_EXTEND == Op.getOpcode();

  // The value being extended is operand 1 of the strict form and operand 0
  // of the plain form.
  const unsigned ValueIdx = Strict ? 1 : 0;
  SDValue Input = Op.getOperand(ValueIdx);
  EVT InputVT = Input.getValueType();

  // Only sources whose scalar type is bf16 need rewriting.
  if (InputVT.getScalarType() != MVT::bf16)
    return Op;

  // Reinterpret the bf16 value as the same-width integer type.
  SDLoc DL(Op);
  EVT IntVT = InputVT.changeTypeToInteger();
  SDValue AsInt = DAG.getNode(ISD::BITCAST, DL, IntVT, Input);

  // The strict form is not handled: it would need a STRICT_BF16_TO_FP node.
  if (Strict)
    llvm_unreachable("Need STRICT_BF16_TO_FP");

  EVT ResultVT = Op.getValueType();
  return DAG.getNode(ISD::BF16_TO_FP, DL, ResultVT, AsInt);
}

Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = StringSwitch<Register>(RegName)
Expand Down Expand Up @@ -5452,6 +5529,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerGET_ROUNDING(Op, DAG);
case ISD::PREFETCH:
return lowerPREFETCH(Op, DAG);
case ISD::FP_EXTEND:
case ISD::STRICT_FP_EXTEND:
return lowerFP_EXTEND(Op, DAG);
}
return SDValue();
}
Expand Down Expand Up @@ -6639,7 +6719,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);

if (ResultVT == MVT::f16) {
if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
}
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;

SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;

Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
Expand Down
42 changes: 36 additions & 6 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1122,7 +1122,7 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
>;

def : GCNPat <
(f64 (fpextend f16:$src)),
(f64 (any_fpextend f16:$src)),
(V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
>;

Expand Down Expand Up @@ -1515,6 +1515,23 @@ def : BitConvert <v2f16, f32, SReg_32>;
def : BitConvert <f32, v2f16, SReg_32>;
def : BitConvert <v2i16, f32, SReg_32>;
def : BitConvert <f32, v2i16, SReg_32>;
// v2bf16 bitcasts to and from i32, v2i16, v2f16 and f32, each declared for
// both the SReg_32 and VGPR_32 register classes.
def : BitConvert <v2bf16, i32, SReg_32>;
def : BitConvert <i32, v2bf16, SReg_32>;
def : BitConvert <v2bf16, i32, VGPR_32>;
def : BitConvert <i32, v2bf16, VGPR_32>;
def : BitConvert <v2bf16, v2i16, SReg_32>;
def : BitConvert <v2i16, v2bf16, SReg_32>;
def : BitConvert <v2bf16, v2i16, VGPR_32>;
def : BitConvert <v2i16, v2bf16, VGPR_32>;
def : BitConvert <v2bf16, v2f16, SReg_32>;
def : BitConvert <v2f16, v2bf16, SReg_32>;
def : BitConvert <v2bf16, v2f16, VGPR_32>;
def : BitConvert <v2f16, v2bf16, VGPR_32>;
def : BitConvert <f32, v2bf16, VGPR_32>;
def : BitConvert <v2bf16, f32, VGPR_32>;
def : BitConvert <f32, v2bf16, SReg_32>;
def : BitConvert <v2bf16, f32, SReg_32>;


// 64-bit bitcast
def : BitConvert <i64, f64, VReg_64>;
Expand Down Expand Up @@ -1958,36 +1975,39 @@ def : GCNPat <
let SubtargetPredicate = HasPackedFP32Ops;
}

foreach fp16vt = [f16, bf16] in {

def : GCNPat <
(fcopysign f16:$src0, f16:$src1),
(fcopysign fp16vt:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
>;

def : GCNPat <
(fcopysign f32:$src0, f16:$src1),
(fcopysign f32:$src0, fp16vt:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
(V_LSHLREV_B32_e64 (i32 16), $src1))
>;

def : GCNPat <
(fcopysign f64:$src0, f16:$src1),
(fcopysign f64:$src0, fp16vt:$src1),
(REG_SEQUENCE SReg_64,
(i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), (i32 (EXTRACT_SUBREG $src0, sub1)),
(V_LSHLREV_B32_e64 (i32 16), $src1)), sub1)
>;

def : GCNPat <
(fcopysign f16:$src0, f32:$src1),
(fcopysign fp16vt:$src0, f32:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), $src1))
>;

def : GCNPat <
(fcopysign f16:$src0, f64:$src1),
(fcopysign fp16vt:$src0, f64:$src1),
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
(V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;
} // End foreach fp16vt = [f16, bf16]

/********** ================== **********/
/********** Immediate Patterns **********/
Expand Down Expand Up @@ -2026,6 +2046,11 @@ def : GCNPat <
(V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
>;

// Select a bf16 floating-point immediate matched by VGPRImm with V_MOV_B32,
// feeding it the immediate converted by bitcast_fpimm_to_i32.
def : GCNPat <
(VGPRImm<(bf16 fpimm)>:$imm),
(V_MOV_B32_e32 (bf16 (bitcast_fpimm_to_i32 $imm)))
>;

// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit
// immediate and will be expanded as needed, but we will only use these patterns
// for values which can be encoded.
Expand Down Expand Up @@ -2059,6 +2084,11 @@ def : GCNPat <
(S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
>;

// Select a bf16 floating-point immediate with S_MOV_B32, using the i32
// produced from the immediate by bitcast_fpimm_to_i32 as the operand.
def : GCNPat <
(bf16 fpimm:$imm),
(S_MOV_B32 (i32 (bitcast_fpimm_to_i32 $imm)))
>;

def : GCNPat <
(p5 frameindex:$fi),
(V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi)))
Expand Down
17,608 changes: 8,083 additions & 9,525 deletions llvm/test/CodeGen/AMDGPU/bf16.ll

Large diffs are not rendered by default.

12 changes: 5 additions & 7 deletions llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1119,10 +1119,9 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s3, s3, 16
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_bfi_b32 v2, s4, v0, v1
; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s3
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_bfi_b32 v2, s4, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
Expand All @@ -1133,9 +1132,8 @@ define amdgpu_kernel void @s_test_copysign_f32_fpext_bf16(ptr addrspace(1) %out,
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_lshl_b32 s3, s3, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s3
; GFX11-NEXT: v_lshlrev_b32_e64 v0, 16, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, s2, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_nop 0
Expand Down
16 changes: 14 additions & 2 deletions llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -787,7 +787,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16:
Expand All @@ -797,7 +797,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
%arg0.ext = fpext half %arg0 to float
%arg1.ext = fpext half %arg1 to float
Expand Down Expand Up @@ -1021,13 +1021,19 @@ define half @fmed3_f32_fpext_bf16(bfloat %arg0, bfloat %arg1, bfloat %arg2) #1 {
; GFX8-LABEL: fmed3_f32_fpext_bf16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmed3_f32_fpext_bf16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -1053,6 +1059,7 @@ define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -1062,6 +1069,7 @@ define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -1087,6 +1095,7 @@ define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -1096,6 +1105,7 @@ define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -1121,6 +1131,7 @@ define half @fmed3_f32_fpext_f16_bf16_2(half %arg0, half %arg1, bfloat %arg2) #1
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -1130,6 +1141,7 @@ define half @fmed3_f32_fpext_f16_bf16_2(half %arg0, half %arg1, bfloat %arg2) #1
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
Expand Down
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1243,17 +1243,17 @@ define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat
; GFX9-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x3020706
; GFX9-NEXT: v_perm_b32 v2, v2, v3, s4
; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v2, v3, v2, s4
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_perm_b32 v2, v2, v3, 0x3020706
; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x3020706
; GFX11-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/function-args-inreg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1875,15 +1875,15 @@ define void @void_func_bf16_inreg(bfloat inreg %arg0) #0 {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: global_store_short_d16_hi v[0:1], v0, off
; GFX9-NEXT: global_store_short v[0:1], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_bf16_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: global_store_d16_hi_b16 v[0:1], v0, off
; GFX11-NEXT: global_store_b16 v[0:1], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store bfloat %arg0, ptr addrspace(1) undef
ret void
Expand Down
167 changes: 78 additions & 89 deletions llvm/test/CodeGen/AMDGPU/function-args.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2818,13 +2818,13 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:16
; VI-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16
; VI-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
Expand All @@ -2833,18 +2833,17 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; VI-NEXT: v_and_b32_e32 v0, 1, v16
; VI-NEXT: v_and_b32_e32 v0, 1, v20
; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_byte v17, off, s[4:7], 0
; VI-NEXT: buffer_store_byte v16, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_short v17, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_short v18, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_short v19, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_short v20, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: void_func_v32i32_i1_i8_i16_bf16:
Expand All @@ -2860,13 +2859,14 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[20:23], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_ubyte v20, off, s[0:3], s32 offset:4
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
; GFX9-NEXT: buffer_load_ubyte v16, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:16
; GFX9-NEXT: buffer_load_ushort v16, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_ushort v17, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_ushort v18, off, s[0:3], s32 offset:16
; GFX9-NEXT: buffer_load_ushort v19, off, s[0:3], s32 offset:20
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
Expand All @@ -2876,33 +2876,32 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; GFX9-NEXT: v_and_b32_e32 v0, 1, v16
; GFX9-NEXT: v_and_b32_e32 v0, 1, v20
; GFX9-NEXT: buffer_store_byte v0, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_byte v17, off, s[4:7], 0
; GFX9-NEXT: buffer_store_byte v16, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_short v17, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_short v18, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_short v19, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_short v20, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_v32i32_i1_i8_i16_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: scratch_load_u8 v33, off, s32 offset:4
; GFX11-NEXT: scratch_load_u16 v34, off, s32 offset:8
; GFX11-NEXT: scratch_load_u16 v35, off, s32 offset:12
; GFX11-NEXT: scratch_load_u16 v36, off, s32 offset:16
; GFX11-NEXT: scratch_load_u8 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_u16 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_u16 v34, off, s32 offset:12
; GFX11-NEXT: scratch_load_u16 v35, off, s32 offset:16
; GFX11-NEXT: scratch_load_u16 v36, off, s32 offset:20
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
Expand All @@ -2911,9 +2910,8 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_and_b32_e32 v16, 1, v33
; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v32
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_and_b32_e32 v16, 1, v32
; GFX11-NEXT: buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
Expand All @@ -2924,17 +2922,18 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b8 v16, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: buffer_store_b8 v33, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: buffer_store_b8 v34, off, s[0:3], 0 dlc
; GFX11-NEXT: buffer_store_b16 v34, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: buffer_store_b16 v35, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_store_b16 v36, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b16 v32, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_setpc_b64 s[30:31]
store volatile <32 x i32> %arg0, ptr addrspace(1) undef
store volatile i1 %arg1, ptr addrspace(1) undef
Expand Down Expand Up @@ -3166,35 +3165,29 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
; VI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12
; VI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20
; VI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
; VI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v13, 16, v16
; VI-NEXT: v_lshrrev_b32_e32 v12, 16, v20
; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v20, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v16, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v17, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v18, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v19, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_short v13, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_short v16, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_short v12, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_short v20, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
Expand All @@ -3212,55 +3205,45 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:16
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:20
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:4
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:8
; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:12
; GFX9-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:20
; GFX9-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:16
; GFX9-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:4
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v13, 16, v16
; GFX9-NEXT: v_lshrrev_b32_e32 v12, 16, v20
; GFX9-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v20, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v16, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v17, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v18, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_dword v19, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_short v13, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_short v16, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_short v12, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_store_short v20, off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_clause 0x5
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:16
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12
; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:20
; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:16
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v32
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v33
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
Expand All @@ -3277,6 +3260,12 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: buffer_store_b32 v32, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: buffer_store_b32 v33, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: buffer_store_b32 v34, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
Expand All @@ -3286,14 +3275,6 @@ define void @void_func_v32i32_v2i16_v2f16_v2bf16_v4bf16(<32 x i32> %arg0, <2 x i
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_store_b32 v36, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b16 v38, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b16 v33, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b16 v37, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_store_b16 v32, off, s[0:3], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_setpc_b64 s[30:31]
store volatile <32 x i32> %arg0, ptr addrspace(1) undef
store volatile <2 x i16> %arg1, ptr addrspace(1) undef
Expand Down Expand Up @@ -4656,20 +4637,28 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {


define void @void_func_bf16(bfloat %arg0) #0 {
; CIGFX89-LABEL: void_func_bf16:
; CIGFX89: ; %bb.0:
; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CIGFX89-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
; CIGFX89-NEXT: s_mov_b32 s6, -1
; CIGFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; CIGFX89-NEXT: s_waitcnt vmcnt(0)
; CIGFX89-NEXT: s_setpc_b64 s[30:31]
; CI-LABEL: void_func_bf16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: void_func_bf16:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0
Expand Down
65 changes: 19 additions & 46 deletions llvm/test/CodeGen/AMDGPU/function-returns.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2365,29 +2365,21 @@ define bfloat @bf16_func_void() #0 {
; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: bf16_func_void:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: bf16_func_void:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: global_load_short_d16_hi v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX89-LABEL: bf16_func_void:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: bf16_func_void:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_load_d16_hi_b16 v0, v[0:1], off
; GFX11-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_load_u16 v0, off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load bfloat, ptr addrspace(1) undef
Expand Down Expand Up @@ -2440,28 +2432,14 @@ define <3 x bfloat> @v3bf16_func_void() #0 {
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v3bf16_func_void:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s7, 0xf000
; GFX8-NEXT: s_mov_b32 s6, -1
; GFX8-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v3bf16_func_void:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v2, v2, 0, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX89-LABEL: v3bf16_func_void:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-NEXT: s_mov_b32 s7, 0xf000
; GFX89-NEXT: s_mov_b32 s6, -1
; GFX89-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; GFX89-NEXT: s_waitcnt vmcnt(0)
; GFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v3bf16_func_void:
; GFX11: ; %bb.0:
Expand All @@ -2470,11 +2448,6 @@ define <3 x bfloat> @v3bf16_func_void() #0 {
; GFX11-NEXT: s_mov_b32 s2, -1
; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[0:3], 0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v2, 0xffff, 0, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
%val = load <3 x bfloat>, ptr addrspace(1) undef
ret <3 x bfloat> %val
Expand Down
128 changes: 48 additions & 80 deletions llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
Original file line number Diff line number Diff line change
Expand Up @@ -16599,7 +16599,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
Expand All @@ -16624,7 +16623,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
Expand Down Expand Up @@ -16652,7 +16650,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
Expand Down Expand Up @@ -16681,7 +16678,6 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
Expand Down Expand Up @@ -16943,7 +16939,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
Expand All @@ -16970,7 +16965,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
Expand Down Expand Up @@ -16998,7 +16992,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
Expand Down Expand Up @@ -17027,7 +17020,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
Expand Down Expand Up @@ -17401,19 +17393,16 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 3
; GFX9-NEXT: v_writelane_b32 v40, s4, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo
; GFX9-NEXT: s_lshl_b32 s4, s4, 16
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s4, v40, 0
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
Expand All @@ -17431,19 +17420,16 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 3
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_bf16@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_bf16@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-NEXT: s_lshl_b32 s4, s4, 16
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
Expand All @@ -17461,20 +17447,17 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 3
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
; GFX11-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s4, 0
; GFX11-NEXT: s_lshl_b32 s4, s4, 16
; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s4, v40, 0
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
Expand All @@ -17492,19 +17475,16 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_bf16@abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_bf16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0
; GFX10-SCRATCH-NEXT: s_lshl_b32 s4, s4, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
Expand Down Expand Up @@ -17755,19 +17735,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
; GFX9-NEXT: v_writelane_b32 v40, s34, 3
; GFX9-NEXT: v_writelane_b32 v40, s5, 0
; GFX9-NEXT: v_writelane_b32 v40, s30, 1
; GFX9-NEXT: s_and_b32 s5, s5, 0xffff
; GFX9-NEXT: v_writelane_b32 v40, s34, 2
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi
; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo
; GFX9-NEXT: s_addk_i32 s32, 0x400
; GFX9-NEXT: v_writelane_b32 v40, s31, 2
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s31, v40, 2
; GFX9-NEXT: v_readlane_b32 s30, v40, 1
; GFX9-NEXT: v_readlane_b32 s5, v40, 0
; GFX9-NEXT: v_readlane_b32 s34, v40, 3
; GFX9-NEXT: v_readlane_b32 s31, v40, 1
; GFX9-NEXT: v_readlane_b32 s30, v40, 0
; GFX9-NEXT: v_readlane_b32 s34, v40, 2
; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[36:37]
Expand All @@ -17785,19 +17762,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s35
; GFX10-NEXT: v_writelane_b32 v40, s34, 3
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3bf16@abs32@hi
; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3bf16@abs32@lo
; GFX10-NEXT: s_addk_i32 s32, 0x200
; GFX10-NEXT: v_writelane_b32 v40, s5, 0
; GFX10-NEXT: s_and_b32 s5, s5, 0xffff
; GFX10-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX10-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-NEXT: v_readlane_b32 s5, v40, 0
; GFX10-NEXT: v_readlane_b32 s34, v40, 3
; GFX10-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-NEXT: v_readlane_b32 s34, v40, 2
; GFX10-NEXT: s_or_saveexec_b32 s35, -1
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
Expand All @@ -17815,20 +17789,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
; GFX11-NEXT: s_mov_b32 exec_lo, s1
; GFX11-NEXT: v_writelane_b32 v40, s0, 3
; GFX11-NEXT: v_writelane_b32 v40, s0, 2
; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi
; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo
; GFX11-NEXT: s_add_i32 s32, s32, 16
; GFX11-NEXT: v_writelane_b32 v40, s5, 0
; GFX11-NEXT: s_and_b32 s5, s5, 0xffff
; GFX11-NEXT: v_writelane_b32 v40, s30, 1
; GFX11-NEXT: v_writelane_b32 v40, s31, 2
; GFX11-NEXT: v_writelane_b32 v40, s30, 0
; GFX11-NEXT: v_writelane_b32 v40, s31, 1
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readlane_b32 s31, v40, 2
; GFX11-NEXT: v_readlane_b32 s30, v40, 1
; GFX11-NEXT: v_readlane_b32 s5, v40, 0
; GFX11-NEXT: v_readlane_b32 s0, v40, 3
; GFX11-NEXT: v_readlane_b32 s31, v40, 1
; GFX11-NEXT: v_readlane_b32 s30, v40, 0
; GFX11-NEXT: v_readlane_b32 s0, v40, 2
; GFX11-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
; GFX11-NEXT: s_mov_b32 exec_lo, s1
Expand All @@ -17846,19 +17817,16 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2
; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3bf16@abs32@hi
; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3bf16@abs32@lo
; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 0
; GFX10-SCRATCH-NEXT: s_and_b32 s5, s5, 0xffff
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
Expand Down
16 changes: 6 additions & 10 deletions llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -613,11 +613,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
; DAGISEL-GFX11-NEXT: {{ $}}
; DAGISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; DAGISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -65536
; DAGISEL-GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec
; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; DAGISEL-GFX11-NEXT: [[S_PACK_LH_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LH_B32_B16 killed [[S_MOV_B32_1]], [[COPY1]]
; DAGISEL-GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_PACK_LH_B32_B16_]], 0, killed [[V_AND_B32_e64_]], 0, 0, implicit $mode, implicit $exec
; DAGISEL-GFX11-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
; DAGISEL-GFX11-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
; DAGISEL-GFX11-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
Expand All @@ -629,11 +627,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
; DAGISEL-GFX10-NEXT: {{ $}}
; DAGISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
; DAGISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -65536
; DAGISEL-GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec
; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
; DAGISEL-GFX10-NEXT: [[S_PACK_LH_B32_B16_:%[0-9]+]]:sreg_32 = S_PACK_LH_B32_B16 killed [[S_MOV_B32_1]], [[COPY1]]
; DAGISEL-GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_PACK_LH_B32_B16_]], 0, killed [[V_AND_B32_e64_]], 0, 0, implicit $mode, implicit $exec
; DAGISEL-GFX10-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec
; DAGISEL-GFX10-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc
; DAGISEL-GFX10-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX10-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_ADD_F32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
Expand Down
25 changes: 13 additions & 12 deletions llvm/test/CodeGen/AMDGPU/llvm.exp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5731,19 +5731,19 @@ define float @v_exp_f32_from_fpext_bf16(bfloat %src) {
; VI-LABEL: v_exp_f32_from_fpext_bf16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-NEXT: v_sub_f32_e32 v4, v0, v1
; VI-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v1
; VI-NEXT: v_mul_f32_e32 v5, 0x39a3b295, v4
; VI-NEXT: v_mul_f32_e32 v4, 0x3fb8a000, v4
; VI-NEXT: v_rndne_f32_e32 v3, v2
; VI-NEXT: v_add_f32_e32 v4, v4, v5
; VI-NEXT: v_mul_f32_e32 v1, 0x39a3b295, v1
; VI-NEXT: v_sub_f32_e32 v2, v2, v3
; VI-NEXT: v_add_f32_e32 v1, v1, v4
; VI-NEXT: v_add_f32_e32 v1, v2, v1
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_sub_f32_e32 v3, v0, v0
; VI-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v0
; VI-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v3
; VI-NEXT: v_mul_f32_e32 v3, 0x3fb8a000, v3
; VI-NEXT: v_rndne_f32_e32 v2, v1
; VI-NEXT: v_add_f32_e32 v3, v3, v4
; VI-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0
; VI-NEXT: v_sub_f32_e32 v1, v1, v2
; VI-NEXT: v_add_f32_e32 v3, v4, v3
; VI-NEXT: v_add_f32_e32 v1, v1, v3
; VI-NEXT: v_exp_f32_e32 v1, v1
; VI-NEXT: v_cvt_i32_f32_e32 v2, v3
; VI-NEXT: v_cvt_i32_f32_e32 v2, v2
; VI-NEXT: s_mov_b32 s4, 0xc2ce8ed0
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
; VI-NEXT: s_mov_b32 s4, 0x42b17218
Expand All @@ -5757,6 +5757,7 @@ define float @v_exp_f32_from_fpext_bf16(bfloat %src) {
; GFX900-LABEL: v_exp_f32_from_fpext_bf16:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
; GFX900-NEXT: s_mov_b32 s4, 0x3fb8aa3b
; GFX900-NEXT: v_rndne_f32_e32 v2, v1
Expand Down
25 changes: 13 additions & 12 deletions llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5809,19 +5809,19 @@ define float @v_exp10_f32_from_fpext_bf16(bfloat %src) {
; VI-LABEL: v_exp10_f32_from_fpext_bf16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xfffff000, v0
; VI-NEXT: v_sub_f32_e32 v4, v0, v1
; VI-NEXT: v_mul_f32_e32 v2, 0x40549000, v1
; VI-NEXT: v_mul_f32_e32 v5, 0x3a2784bc, v4
; VI-NEXT: v_mul_f32_e32 v4, 0x40549000, v4
; VI-NEXT: v_rndne_f32_e32 v3, v2
; VI-NEXT: v_add_f32_e32 v4, v4, v5
; VI-NEXT: v_mul_f32_e32 v1, 0x3a2784bc, v1
; VI-NEXT: v_sub_f32_e32 v2, v2, v3
; VI-NEXT: v_add_f32_e32 v1, v1, v4
; VI-NEXT: v_add_f32_e32 v1, v2, v1
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_sub_f32_e32 v3, v0, v0
; VI-NEXT: v_mul_f32_e32 v1, 0x40549000, v0
; VI-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v3
; VI-NEXT: v_mul_f32_e32 v3, 0x40549000, v3
; VI-NEXT: v_rndne_f32_e32 v2, v1
; VI-NEXT: v_add_f32_e32 v3, v3, v4
; VI-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0
; VI-NEXT: v_sub_f32_e32 v1, v1, v2
; VI-NEXT: v_add_f32_e32 v3, v4, v3
; VI-NEXT: v_add_f32_e32 v1, v1, v3
; VI-NEXT: v_exp_f32_e32 v1, v1
; VI-NEXT: v_cvt_i32_f32_e32 v2, v3
; VI-NEXT: v_cvt_i32_f32_e32 v2, v2
; VI-NEXT: s_mov_b32 s4, 0xc23369f4
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
; VI-NEXT: s_mov_b32 s4, 0x421a209b
Expand All @@ -5835,6 +5835,7 @@ define float @v_exp10_f32_from_fpext_bf16(bfloat %src) {
; GFX900-LABEL: v_exp10_f32_from_fpext_bf16:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
; GFX900-NEXT: s_mov_b32 s4, 0x40549a78
; GFX900-NEXT: v_rndne_f32_e32 v2, v1
Expand Down
58 changes: 43 additions & 15 deletions llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1968,19 +1968,49 @@ define float @v_exp2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
}

define float @v_exp2_f32_from_fpext_bf16(bfloat %src) {
; GCN-LABEL: v_exp2_f32_from_fpext_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s4, 0xc2fc0000
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0x42800000
; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT: v_add_f32_e32 v0, v0, v2
; GCN-NEXT: v_exp_f32_e32 v0, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0x1f800000
; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: v_exp2_f32_from_fpext_bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0xc2fc0000
; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; SI-NEXT: v_mov_b32_e32 v2, 0x42800000
; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; SI-NEXT: v_add_f32_e32 v0, v0, v2
; SI-NEXT: v_exp_f32_e32 v0, v0
; SI-NEXT: v_mov_b32_e32 v1, 0x1f800000
; SI-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_exp2_f32_from_fpext_bf16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: s_mov_b32 s4, 0xc2fc0000
; VI-NEXT: v_mov_b32_e32 v1, 0x42800000
; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-NEXT: v_add_f32_e32 v0, v0, v1
; VI-NEXT: v_exp_f32_e32 v0, v0
; VI-NEXT: v_mov_b32_e32 v1, 0x1f800000
; VI-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_exp2_f32_from_fpext_bf16:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: s_mov_b32 s4, 0xc2fc0000
; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX900-NEXT: v_mov_b32_e32 v2, 0x42800000
; GFX900-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GFX900-NEXT: v_add_f32_e32 v0, v0, v2
; GFX900-NEXT: v_exp_f32_e32 v0, v0
; GFX900-NEXT: v_mov_b32_e32 v1, 0x1f800000
; GFX900-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; GFX900-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp2_f32_from_fpext_bf16:
; R600: ; %bb.0:
Expand Down Expand Up @@ -2936,5 +2966,3 @@ declare <3 x half> @llvm.exp2.v3f16(<3 x half>) #2
attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" }
attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; SI: {{.*}}
402 changes: 113 additions & 289 deletions llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll

Large diffs are not rendered by default.

11 changes: 7 additions & 4 deletions llvm/test/CodeGen/AMDGPU/llvm.log.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6234,6 +6234,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
; VI-LABEL: v_log_f32_from_fpext_bf16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: s_mov_b32 s4, 0x800000
; VI-NEXT: v_mov_b32_e32 v1, 0x4f800000
; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
Expand All @@ -6260,6 +6261,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
; GFX900-LABEL: v_log_f32_from_fpext_bf16:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: s_mov_b32 s4, 0x800000
; GFX900-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
Expand All @@ -6283,22 +6285,23 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
; GFX1100-LABEL: v_log_f32_from_fpext_bf16:
; GFX1100: ; %bb.0:
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX1100-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-NEXT: v_log_f32_e32 v0, v0
; GFX1100-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0
; GFX1100-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1
; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2
; GFX1100-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
Expand Down
11 changes: 7 additions & 4 deletions llvm/test/CodeGen/AMDGPU/llvm.log10.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6234,6 +6234,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
; VI-LABEL: v_log10_f32_from_fpext_bf16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: s_mov_b32 s4, 0x800000
; VI-NEXT: v_mov_b32_e32 v1, 0x4f800000
; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
Expand All @@ -6260,6 +6261,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
; GFX900-LABEL: v_log10_f32_from_fpext_bf16:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: s_mov_b32 s4, 0x800000
; GFX900-NEXT: v_mov_b32_e32 v1, 0x4f800000
; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
Expand All @@ -6283,22 +6285,23 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
; GFX1100-LABEL: v_log10_f32_from_fpext_bf16:
; GFX1100: ; %bb.0:
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX1100-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX1100-NEXT: v_log_f32_e32 v0, v0
; GFX1100-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0
; GFX1100-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1
; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2
; GFX1100-NEXT: v_add_f32_e32 v1, v1, v2
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0
; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX1100-NEXT: s_setpc_b64 s[30:31]
;
Expand Down
62 changes: 46 additions & 16 deletions llvm/test/CodeGen/AMDGPU/llvm.log2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2727,28 +2727,60 @@ define float @v_log2_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
}

define float @v_log2_f32_from_fpext_bf16(bfloat %src) {
; GFX689-LABEL: v_log2_f32_from_fpext_bf16:
; GFX689: ; %bb.0:
; GFX689-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX689-NEXT: s_mov_b32 s4, 0x800000
; GFX689-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX689-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX689-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; GFX689-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX689-NEXT: v_log_f32_e32 v0, v0
; GFX689-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX689-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX689-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX689-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: v_log2_f32_from_fpext_bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, 0x800000
; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; SI-NEXT: v_mov_b32_e32 v2, 0x4f800000
; SI-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; SI-NEXT: v_mul_f32_e32 v0, v0, v2
; SI-NEXT: v_log_f32_e32 v0, v0
; SI-NEXT: v_mov_b32_e32 v1, 0x42000000
; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; SI-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_log2_f32_from_fpext_bf16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: s_mov_b32 s4, 0x800000
; VI-NEXT: v_mov_b32_e32 v1, 0x4f800000
; VI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; VI-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
; VI-NEXT: v_mul_f32_e32 v0, v0, v1
; VI-NEXT: v_log_f32_e32 v0, v0
; VI-NEXT: v_mov_b32_e32 v1, 0x42000000
; VI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; VI-NEXT: v_sub_f32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: v_log2_f32_from_fpext_bf16:
; GFX900: ; %bb.0:
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX900-NEXT: s_mov_b32 s4, 0x800000
; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX900-NEXT: v_mov_b32_e32 v2, 0x4f800000
; GFX900-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
; GFX900-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX900-NEXT: v_log_f32_e32 v0, v0
; GFX900-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: v_log2_f32_from_fpext_bf16:
; GFX1100: ; %bb.0:
; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX1100-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1100-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1100-NEXT: v_log_f32_e32 v0, v0
; GFX1100-NEXT: s_waitcnt_depctr 0xfff
; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1
Expand Down Expand Up @@ -3973,5 +4005,3 @@ declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #2
attributes #0 = { "denormal-fp-math-f32"="ieee,preserve-sign" }
attributes #1 = { "denormal-fp-math-f32"="dynamic,dynamic" }
attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; SI: {{.*}}
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1422,7 +1422,7 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; VI-NEXT: s_cbranch_execnz .LBB10_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_lshrrev_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_lshrrev_b32_e32 v0, v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: lds_atomic_fadd_ret_bf16:
Expand Down Expand Up @@ -1452,7 +1452,7 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX9-NEXT: s_cbranch_execnz .LBB10_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_lshrrev_b32_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_lshrrev_b32_e32 v0, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: lds_atomic_fadd_ret_bf16:
Expand Down
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/AMDGPU/select-undef.ll
Original file line number Diff line number Diff line change
Expand Up @@ -303,8 +303,9 @@ ret:
ret void
}

; FIXME: This shouldn't have the 0 initialization
; GCN-LABEL: {{^}}undef_v3bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v3bf16(ptr addrspace(3) %ptr, i1 %cond) {
Expand Down
850 changes: 258 additions & 592 deletions llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll

Large diffs are not rendered by default.

9 changes: 8 additions & 1 deletion mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ def LLVM_LoadOp : LLVM_MemAccessOpBase<"load",
OptionalAttr<I64Attr>:$alignment,
UnitAttr:$volatile_,
UnitAttr:$nontemporal,
UnitAttr:$invariant,
DefaultValuedAttr<
AtomicOrdering, "AtomicOrdering::not_atomic">:$ordering,
OptionalAttr<StrAttr>:$syncscope);
Expand Down Expand Up @@ -364,11 +365,16 @@ def LLVM_LoadOp : LLVM_MemAccessOpBase<"load",
let assemblyFormat = [{
(`volatile` $volatile_^)? $addr
(`atomic` (`syncscope` `(` $syncscope^ `)`)? $ordering^)?
(`invariant` $invariant^)?
attr-dict `:` qualified(type($addr)) `->` type($res)
}];
string llvmBuilder = [{
auto *inst = builder.CreateLoad($_resultType, $addr, $volatile_);
$res = inst;
if ($invariant) {
llvm::MDNode *metadata = llvm::MDNode::get(inst->getContext(), std::nullopt);
inst->setMetadata(llvm::LLVMContext::MD_invariant_load, metadata);
}
}] # setOrderingCode
# setSyncScopeCode
# setAlignmentCode
Expand All @@ -381,13 +387,14 @@ def LLVM_LoadOp : LLVM_MemAccessOpBase<"load",
$res = $_builder.create<LLVM::LoadOp>($_location, $_resultType, $addr,
alignment, loadInst->isVolatile(),
loadInst->hasMetadata(llvm::LLVMContext::MD_nontemporal),
loadInst->hasMetadata(llvm::LLVMContext::MD_invariant_load),
convertAtomicOrderingFromLLVM(loadInst->getOrdering()),
getLLVMSyncScope(loadInst));
}];
let builders = [
OpBuilder<(ins "Type":$type, "Value":$addr,
CArg<"unsigned", "0">:$alignment, CArg<"bool", "false">:$isVolatile,
CArg<"bool", "false">:$isNonTemporal,
CArg<"bool", "false">:$isNonTemporal, CArg<"bool", "false">:$isInvariant,
CArg<"AtomicOrdering", "AtomicOrdering::not_atomic">:$ordering,
CArg<"StringRef", "StringRef()">:$syncscope)>
];
Expand Down
6 changes: 3 additions & 3 deletions mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -822,11 +822,11 @@ LogicalResult LoadOp::verify() {

void LoadOp::build(OpBuilder &builder, OperationState &state, Type type,
Value addr, unsigned alignment, bool isVolatile,
bool isNonTemporal, AtomicOrdering ordering,
StringRef syncscope) {
bool isNonTemporal, bool isInvariant,
AtomicOrdering ordering, StringRef syncscope) {
build(builder, state, type, addr,
alignment ? builder.getI64IntegerAttr(alignment) : nullptr, isVolatile,
isNonTemporal, ordering,
isNonTemporal, isInvariant, ordering,
syncscope.empty() ? nullptr : builder.getStringAttr(syncscope),
/*access_groups=*/nullptr,
/*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr,
Expand Down
7 changes: 7 additions & 0 deletions mlir/test/Dialect/LLVMIR/roundtrip.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,13 @@ func.func @cmpxchg(%ptr : !llvm.ptr, %cmp : i32, %new : i32) {
llvm.return
}

// Verifies that the `invariant` keyword on llvm.load is accepted by the parser
// and printed back in the same position, ahead of the attribute dictionary.
// CHECK-LABEL: @invariant_load
func.func @invariant_load(%ptr : !llvm.ptr) -> i32 {
// CHECK: llvm.load %{{.+}} invariant {alignment = 4 : i64} : !llvm.ptr -> i32
%0 = llvm.load %ptr invariant {alignment = 4 : i64} : !llvm.ptr -> i32
func.return %0 : i32
}

llvm.mlir.global external constant @_ZTIi() : !llvm.ptr
llvm.func @bar(!llvm.ptr, !llvm.ptr, !llvm.ptr)
llvm.func @__gxx_personality_v0(...) -> i32
Expand Down
13 changes: 13 additions & 0 deletions mlir/test/Target/LLVMIR/Import/instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,19 @@ define void @load_store(ptr %ptr) {

; // -----

; Verifies that a load carrying an !invariant.load attachment is imported as an
; llvm.load with the `invariant` keyword.
; CHECK-LABEL: @invariant_load
; CHECK-SAME: %[[PTR:[a-zA-Z0-9]+]]
define float @invariant_load(ptr %ptr) {
; CHECK: %[[V:[0-9]+]] = llvm.load %[[PTR]] invariant {alignment = 4 : i64} : !llvm.ptr -> f32
%1 = load float, ptr %ptr, align 4, !invariant.load !0
; CHECK: llvm.return %[[V]]
ret float %1
}

; Empty metadata node referenced by the !invariant.load attachment above.
!0 = !{}

; // -----

; CHECK-LABEL: @atomic_load_store
; CHECK-SAME: %[[PTR:[a-zA-Z0-9]+]]
define void @atomic_load_store(ptr %ptr) {
Expand Down
11 changes: 11 additions & 0 deletions mlir/test/Target/LLVMIR/llvmir.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -1911,6 +1911,17 @@ llvm.func @nontemporal_store_and_load() {

// -----

// Check that the invariant attribute on llvm.load is exported as an
// !invariant.load attachment that refers to an empty metadata node.
llvm.func @invariant_load(%ptr : !llvm.ptr) -> i32 {
// CHECK: !invariant.load ![[NODE:[0-9]+]]
%1 = llvm.load %ptr invariant : !llvm.ptr -> i32
llvm.return %1 : i32
}

// CHECK: ![[NODE]] = !{}

// -----

llvm.func @atomic_store_and_load(%ptr : !llvm.ptr) {
// CHECK: load atomic
// CHECK-SAME: acquire, align 4
Expand Down