Skip to content

Commit

Permalink
[AMDGPU] Support byte_sel modifier on v_cvt_sr_fp8_f32 and v_cvt_sr_b…
Browse files Browse the repository at this point in the history
…f8_f32 (#90244)
  • Loading branch information
rampitec committed Apr 26, 2024
1 parent 300340f commit 6e722bb
Show file tree
Hide file tree
Showing 17 changed files with 256 additions and 42 deletions.
36 changes: 29 additions & 7 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
ImmTyWaitEXP,
ImmTyWaitVAVDst,
ImmTyWaitVMVSrc,
ImmTyByteSel,
};

// Immediate operand kind.
Expand Down Expand Up @@ -410,6 +411,9 @@ class AMDGPUOperand : public MCParsedAsmOperand {
bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); }
bool isNegLo() const { return isImmTy(ImmTyNegLo); }
bool isNegHi() const { return isImmTy(ImmTyNegHi); }
bool isByteSel() const {
return isImmTy(ImmTyByteSel) && isUInt<2>(getImm());
}

bool isRegOrImm() const {
return isReg() || isImm();
Expand Down Expand Up @@ -1139,6 +1143,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
case ImmTyWaitEXP: OS << "WaitEXP"; break;
case ImmTyWaitVAVDst: OS << "WaitVAVDst"; break;
case ImmTyWaitVMVSrc: OS << "WaitVMVSrc"; break;
case ImmTyByteSel: OS << "ByteSel" ; break;
}
// clang-format on
}
Expand Down Expand Up @@ -8644,6 +8649,13 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
}
}

if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
assert(AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in));
Inst.addOperand(Inst.getOperand(0));
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyByteSel);
}

if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyClampSI);
Expand Down Expand Up @@ -8680,8 +8692,8 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,

if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) {
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_gfx12) {
Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
Inst.addOperand(Inst.getOperand(0));
}
Expand All @@ -8692,7 +8704,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) {
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12)) {
assert(!IsPacked);
Inst.addOperand(Inst.getOperand(0));
}
Expand Down Expand Up @@ -9207,10 +9223,11 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
Inst.addOperand(Inst.getOperand(0));
}

bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12;
bool IsVOP3CvtSrDpp =
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12;
if (IsVOP3CvtSrDpp) {
if (Src2ModIdx == static_cast<int>(Inst.getNumOperands())) {
Inst.addOperand(MCOperand::createImm(0));
Expand Down Expand Up @@ -9243,6 +9260,11 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
llvm_unreachable("unhandled operand type");
}
}

if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyByteSel);

if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp))
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);

Expand Down
8 changes: 0 additions & 8 deletions llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -869,10 +869,6 @@ void AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
if (VDstInIdx != -1)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);

if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);

unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
Expand Down Expand Up @@ -902,10 +898,6 @@ void AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
if (VDstInIdx != -1)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);

if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);

unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
if (NegHiOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::neg_hi)) {
DPPInst.addImm(NegHiOpr->getImm());
}
auto *ByteSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::byte_sel);
if (ByteSelOpr &&
AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::byte_sel)) {
DPPInst.addImm(ByteSelOpr->getImm());
}
}
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1806,4 +1806,14 @@ void AMDGPUInstPrinter::printEndpgm(const MCInst *MI, unsigned OpNo,
O << ' ' << formatDec(Imm);
}

void AMDGPUInstPrinter::printByteSel(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
uint8_t Imm = MI->getOperand(OpNo).getImm();
if (!Imm)
return;

O << " byte_sel:" << formatDec(Imm);
}

#include "AMDGPUGenAsmWriter.inc"
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,8 @@ class AMDGPUInstPrinter : public MCInstPrinter {
const MCSubtargetInfo &STI, raw_ostream &O);
void printExpTgt(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printByteSel(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);

public:
static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
Expand Down
17 changes: 11 additions & 6 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1123,6 +1123,8 @@ def WaitEXP : NamedIntOperand<i8, "wait_exp">;
def WaitVAVDst : NamedIntOperand<i8, "wait_va_vdst">;
def WaitVMVSrc : NamedIntOperand<i8, "wait_vm_vsrc">;

def ByteSel : NamedIntOperand<i8, "byte_sel">;

class KImmFPOperand<ValueType vt> : ImmOperand<vt> {
let OperandNamespace = "AMDGPU";
let OperandType = "OPERAND_KIMM"#vt.Size;
Expand Down Expand Up @@ -1700,9 +1702,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
clampmod0:$clamp, omod0:$omod),
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
clampmod0:$clamp))
!con((ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1),
!if(HasClamp, (ins clampmod0:$clamp), (ins))))
/* else */,
// VOP2 without modifiers
!if (HasClamp,
Expand Down Expand Up @@ -2036,7 +2038,8 @@ class getAsmDPP8 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT
class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
bit HasOpSel, bit HasOMod, bit IsVOP3P,
bit HasModifiers, bit Src0HasMods,
bit Src1HasMods, bit Src2HasMods, ValueType DstVT = i32> {
bit Src1HasMods, bit Src2HasMods, ValueType DstVT = i32,
bit HasByteSel = 0> {
string dst = !if(HasDst,
!if(!eq(DstVT.Size, 1),
"$sdst",
Expand All @@ -2058,14 +2061,15 @@ class getAsmVOP3Base <int NumSrcArgs, bit HasDst, bit HasClamp,
string src1 = !if(Src1HasMods, src1mods, src1nomods);
string src2 = !if(Src2HasMods, src2mods, src2nomods);
string opsel = !if(HasOpSel, "$op_sel", "");
string bytesel = !if(HasByteSel, "$byte_sel", "");
string 3PMods = !if(IsVOP3P,
!if(HasOpSel, "$op_sel_hi", "")
#!if(HasModifiers, "$neg_lo$neg_hi", ""),
"");
string clamp = !if(HasClamp, "$clamp", "");
string omod = !if(HasOMod, "$omod", "");

string ret = dst#!if(!gt(NumSrcArgs,0),", "#src0#src1#src2#opsel#3PMods#clamp#omod, "");
string ret = dst#!if(!gt(NumSrcArgs,0),", "#src0#src1#src2#opsel#bytesel#3PMods#clamp#omod, "");

}

Expand Down Expand Up @@ -2282,6 +2286,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit IsSWMMAC = 0;

field bit IsFP8 = 0;
field bit IsFP8DstByteSel = 0;

field bit HasDst = !ne(DstVT.Value, untyped.Value);
field bit HasDst32 = HasDst;
Expand Down Expand Up @@ -2401,7 +2406,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
field string AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers,
HasModifiers, DstVT>.ret;
HasModifiers, DstVT, IsFP8DstByteSel>.ret;
field string Asm64 = AsmVOP3Base;
field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp, HasOpSel>.ret;
field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
Expand Down
46 changes: 40 additions & 6 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,22 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
HasSrc2FloatMods>.ret>.ret);
}

class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> :
VOP3_Profile<VOPProfile<[i32, SrcVT, i32, untyped]>> {
let IsFP8DstByteSel = 1;
let HasClamp = 0;
defvar bytesel = (ins VGPR_32:$vdst_in, ByteSel:$byte_sel);
let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
HasClamp, HasModifiers, HasSrc2Mods,
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
bytesel);
let InsVOP3Base = !con(
getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
Src2VOP3DPP, NumSrcArgs, HasClamp, HasModifiers, HasSrc2Mods, HasOMod,
Src0ModVOP3DPP, Src1ModVOP3DPP, Src2ModVOP3DPP, HasOpSel>.ret,
bytesel);
}

def IsPow2Plus1: PatLeaf<(i32 imm), [{
uint32_t V = N->getZExtValue();
return isPowerOf2_32(V - 1);
Expand Down Expand Up @@ -645,12 +661,17 @@ let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
defm V_CVT_PK_BF8_F32 : VOP3Inst<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile>;

let SubtargetPredicate = isGFX12Plus in {
defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
defm V_CVT_SR_BF8_F32_gfx12 : VOP3Inst<"v_cvt_sr_bf8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
}
}

// These instructions have non-standard use of op_sel. In particular they are
// using op_sel bits 2 and 3 while only having two sources. Therefore dummy
// src2 is used to hold the op_sel value.
let Constraints = "$vdst = $src2", DisableEncoding = "$src2" in {
let Constraints = "$vdst = $src2", DisableEncoding = "$src2", SubtargetPredicate = isGFX940Plus in {
defm V_CVT_SR_FP8_F32 : VOP3Inst<"v_cvt_sr_fp8_f32", VOP3_CVT_SR_F8_F32_Profile>;
defm V_CVT_SR_BF8_F32 : VOP3Inst<"v_cvt_sr_bf8_f32", VOP3_CVT_SR_F8_F32_Profile>;
}
Expand All @@ -667,15 +688,28 @@ class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst>
!if(index{0}, SRCMODS.OP_SEL_0, 0), $old, 0)
>;

class Cvt_SR_F8_ByteSel_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType SrcVT> : GCNPat<
(i32 (node (VOP3Mods SrcVT:$src0, i32:$src0_modifiers), (VOP3Mods i32:$src1, i32:$src1_modifiers),
i32:$old, timm:$byte_sel)),
(inst $src0_modifiers, $src0, $src1_modifiers, $src1, $old, (as_i32timm $byte_sel))
>;

let OtherPredicates = [HasFP8ConversionInsts] in {
foreach Index = [0, -1] in {
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
}

foreach Index = [0, 1, 2, 3] in {
def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
let SubtargetPredicate = isGFX940Plus in {
foreach Index = [0, 1, 2, 3] in {
def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
}
}

let SubtargetPredicate = isGFX12Plus in {
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_fp8_f32, V_CVT_SR_FP8_F32_gfx12_e64, f32>;
def : Cvt_SR_F8_ByteSel_Pat<int_amdgcn_cvt_sr_bf8_f32, V_CVT_SR_BF8_F32_gfx12_e64, f32>;
}
}

Expand Down Expand Up @@ -1040,8 +1074,8 @@ defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;

defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>;
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>;
defm V_CVT_SR_FP8_F32 : VOP3Only_Realtriple_gfx12<0x36b>;
defm V_CVT_SR_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36c>;
defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;
defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;

//===----------------------------------------------------------------------===//
// GFX11, GFX12
Expand Down
23 changes: 20 additions & 3 deletions llvm/lib/Target/AMDGPU/VOPInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,14 @@ class VOP3FP8OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0);
}

class VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
bits<2> byte_sel;

let Inst{11} = 0; // op_sel0
let Inst{12} = 0; // op_sel1
let Inst{14-13} = byte_sel; // op_sel2/3
}

class VOP3DotOpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11_gfx12<op, p>{
let Inst{11} = ?;
let Inst{12} = ?;
Expand Down Expand Up @@ -741,15 +749,16 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
bits<3> src2_modifiers;
bits<1> clamp;
bits<2> omod;
bits<2> byte_sel;

let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0);
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
// OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?);
let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?);
let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?);
let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?);
let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),!if(P.IsFP8DstByteSel, byte_sel{0}, ?));
let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),!if(P.IsFP8DstByteSel, byte_sel{1}, ?));
let Inst{15} = !if(P.HasClamp, clamp, 0);
let Inst{25-16} = op;
let Inst{31-26} = 0x35;
Expand Down Expand Up @@ -1388,7 +1397,11 @@ multiclass VOP3_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
bit isSingle = 0> {
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
if ps.Pfl.HasOpSel then {
if ps.Pfl.IsFP8DstByteSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
} if ps.Pfl.HasOpSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
Expand Down Expand Up @@ -1419,6 +1432,10 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
VOP3FP8OpSel_gfx11_gfx12<op, ps.Pfl>;
} else if ps.Pfl.IsFP8DstByteSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
} else if ps.Pfl.HasOpSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
Expand Down
8 changes: 2 additions & 6 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
Original file line number Diff line number Diff line change
Expand Up @@ -97,9 +97,7 @@ define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr a
define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0]
; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: global_store_b32 v[3:4], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand All @@ -114,9 +112,7 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr a
define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1]
; GFX12-NEXT: v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: global_store_b32 v[3:4], v2, off
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
Expand Down
Loading

0 comments on commit 6e722bb

Please sign in to comment.