-
Notifications
You must be signed in to change notification settings - Fork 11.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][True16] Support VOP3 source DPP operands. #80892
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel Author: Ivan Kosarev (kosarev) Changes: Patch is 56.49 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/80892.diff 15 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 225e781588668f..a94da992b33859 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -314,8 +314,9 @@ class AMDGPUOperand : public MCParsedAsmOperand {
return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::f64);
}
- bool isRegOrInlineImmWithFP16InputMods() const {
- return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f16);
+ template <bool IsFake16> bool isRegOrInlineImmWithFP16InputMods() const {
+ return isRegOrInline(
+ IsFake16 ? AMDGPU::VS_32RegClassID : AMDGPU::VS_16RegClassID, MVT::f16);
}
bool isRegOrInlineImmWithFP32InputMods() const {
@@ -8151,7 +8152,7 @@ ParseStatus AMDGPUAsmParser::parseOModSI(OperandVector &Operands) {
// Determines which bit DST_OP_SEL occupies in the op_sel operand according to
// the number of src operands present, then copies that bit into src0_modifiers.
-void cvtVOP3DstOpSelOnly(MCInst &Inst) {
+static void cvtVOP3DstOpSelOnly(MCInst &Inst, const MCRegisterInfo &MRI) {
int Opc = Inst.getOpcode();
int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
if (OpSelIdx == -1)
@@ -8168,23 +8169,34 @@ void cvtVOP3DstOpSelOnly(MCInst &Inst) {
unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
- if ((OpSel & (1 << SrcNum)) != 0) {
- int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
- uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
- Inst.getOperand(ModIdx).setImm(ModVal | SISrcMods::DST_OP_SEL);
+ int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DstIdx == -1)
+ return;
+
+ const MCOperand &DstOp = Inst.getOperand(DstIdx);
+ int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
+ if (DstOp.isReg() &&
+ MRI.getRegClass(AMDGPU::VGPR_16RegClassID).contains(DstOp.getReg())) {
+ if (AMDGPU::isHi(DstOp.getReg(), MRI))
+ ModVal |= SISrcMods::DST_OP_SEL;
+ } else {
+ if ((OpSel & (1 << SrcNum)) != 0)
+ ModVal |= SISrcMods::DST_OP_SEL;
}
+ Inst.getOperand(ModIdx).setImm(ModVal);
}
void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst,
const OperandVector &Operands) {
cvtVOP3P(Inst, Operands);
- cvtVOP3DstOpSelOnly(Inst);
+ cvtVOP3DstOpSelOnly(Inst, *getMRI());
}
void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx) {
cvtVOP3P(Inst, Operands, OptionalIdx);
- cvtVOP3DstOpSelOnly(Inst);
+ cvtVOP3DstOpSelOnly(Inst, *getMRI());
}
static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
@@ -8433,8 +8445,17 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
uint32_t ModVal = 0;
- if ((OpSel & (1 << J)) != 0)
- ModVal |= SISrcMods::OP_SEL_0;
+ const MCOperand &SrcOp = Inst.getOperand(OpIdx);
+ if (SrcOp.isReg() && getMRI()
+ ->getRegClass(AMDGPU::VGPR_16RegClassID)
+ .contains(SrcOp.getReg())) {
+ bool VGPRSuffixIsHi = AMDGPU::isHi(SrcOp.getReg(), *getMRI());
+ if (VGPRSuffixIsHi)
+ ModVal |= SISrcMods::OP_SEL_0;
+ } else {
+ if ((OpSel & (1 << J)) != 0)
+ ModVal |= SISrcMods::OP_SEL_0;
+ }
if ((OpSelHi & (1 << J)) != 0)
ModVal |= SISrcMods::OP_SEL_1;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index fba9eb53c8a8b4..2f48b2e3026300 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -913,6 +913,41 @@ static VOPModifiers collectVOPModifiers(const MCInst &MI,
return Modifiers;
}
+// Instructions decode the op_sel/suffix bits into the src_modifier
+// operands. Copy those bits into the src operands for true16 VGPRs.
+void AMDGPUDisassembler::convertTrue16OpSel(MCInst &MI) const {
+ const unsigned Opc = MI.getOpcode();
+ const MCRegisterClass &ConversionRC =
+ MRI.getRegClass(AMDGPU::VGPR_16RegClassID);
+ constexpr std::array<std::tuple<int, int, unsigned>, 4> OpAndOpMods = {
+ {{AMDGPU::OpName::src0, AMDGPU::OpName::src0_modifiers,
+ SISrcMods::OP_SEL_0},
+ {AMDGPU::OpName::src1, AMDGPU::OpName::src1_modifiers,
+ SISrcMods::OP_SEL_0},
+ {AMDGPU::OpName::src2, AMDGPU::OpName::src2_modifiers,
+ SISrcMods::OP_SEL_0},
+ {AMDGPU::OpName::vdst, AMDGPU::OpName::src0_modifiers,
+ SISrcMods::DST_OP_SEL}}};
+ for (const auto &[OpName, OpModsName, OpSelMask] : OpAndOpMods) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(Opc, OpName);
+ int OpModsIdx = AMDGPU::getNamedOperandIdx(Opc, OpModsName);
+ if (OpIdx == -1 || OpModsIdx == -1)
+ continue;
+ MCOperand &Op = MI.getOperand(OpIdx);
+ if (!Op.isReg())
+ continue;
+ if (!ConversionRC.contains(Op.getReg()))
+ continue;
+ unsigned OpEnc = MRI.getEncodingValue(Op.getReg());
+ const MCOperand &OpMods = MI.getOperand(OpModsIdx);
+ unsigned ModVal = OpMods.getImm();
+ if (ModVal & OpSelMask) { // isHi
+ unsigned RegIdx = OpEnc & AMDGPU::HWEncoding::REG_IDX_MASK;
+ Op.setReg(ConversionRC.getRegister(RegIdx * 2 + 1));
+ }
+ }
+}
+
// MAC opcodes have special old and src2 operands.
// src2 is tied to dst, while old is not tied (but assumed to be).
bool AMDGPUDisassembler::isMacDPP(MCInst &MI) const {
@@ -991,6 +1026,8 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
if (isMacDPP(MI))
convertMacDPPInst(MI);
+ convertTrue16OpSel(MI);
+
int VDstInIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
if (VDstInIdx != -1)
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 5a89b30f6fb36a..02feaf553c0c45 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -203,6 +203,7 @@ class AMDGPUDisassembler : public MCDisassembler {
DecodeStatus convertVOP3PDPPInst(MCInst &MI) const;
DecodeStatus convertVOPCDPPInst(MCInst &MI) const;
void convertMacDPPInst(MCInst &MI) const;
+ void convertTrue16OpSel(MCInst &MI) const;
enum OpWidthTy {
OPW32,
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index a812cdc61500cc..8bf05682cbe7ea 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -756,14 +756,14 @@ void SIFoldOperands::foldOperand(
int UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
- const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+ const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
- if (!isUseSafeToFold(*UseMI, UseOp))
+ if (!isUseSafeToFold(*UseMI, *UseOp))
return;
// FIXME: Fold operands with subregs.
- if (UseOp.isReg() && OpToFold.isReg() &&
- (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister))
+ if (UseOp->isReg() && OpToFold.isReg() &&
+ (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
return;
// Special case for REG_SEQUENCE: We can't fold literals into
@@ -859,7 +859,6 @@ void SIFoldOperands::foldOperand(
if (MovOp == AMDGPU::COPY)
return;
- UseMI->setDesc(TII->get(MovOp));
MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
while (ImpOpI != ImpOpE) {
@@ -867,6 +866,19 @@ void SIFoldOperands::foldOperand(
ImpOpI++;
UseMI->removeOperand(UseMI->getOperandNo(Tmp));
}
+ UseMI->setDesc(TII->get(MovOp));
+
+ if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
+ const auto &SrcOp = UseMI->getOperand(UseOpIdx);
+ MachineOperand NewSrcOp(SrcOp);
+ MachineFunction *MF = UseMI->getParent()->getParent();
+ UseMI->removeOperand(1);
+ UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
+ UseMI->addOperand(NewSrcOp); // src0
+ UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
+ UseOpIdx = 2;
+ UseOp = &UseMI->getOperand(UseOpIdx);
+ }
CopiesToReplace.push_back(UseMI);
} else {
if (UseMI->isCopy() && OpToFold.isReg() &&
@@ -1027,7 +1039,7 @@ void SIFoldOperands::foldOperand(
// Don't fold into target independent nodes. Target independent opcodes
// don't have defined register classes.
- if (UseDesc.isVariadic() || UseOp.isImplicit() ||
+ if (UseDesc.isVariadic() || UseOp->isImplicit() ||
UseDesc.operands()[UseOpIdx].RegClass == -1)
return;
}
@@ -1062,17 +1074,17 @@ void SIFoldOperands::foldOperand(
TRI->getRegClass(FoldDesc.operands()[0].RegClass);
// Split 64-bit constants into 32-bits for folding.
- if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
- Register UseReg = UseOp.getReg();
+ if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
+ Register UseReg = UseOp->getReg();
const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
if (AMDGPU::getRegBitWidth(*UseRC) != 64)
return;
APInt Imm(64, OpToFold.getImm());
- if (UseOp.getSubReg() == AMDGPU::sub0) {
+ if (UseOp->getSubReg() == AMDGPU::sub0) {
Imm = Imm.getLoBits(32);
} else {
- assert(UseOp.getSubReg() == AMDGPU::sub1);
+ assert(UseOp->getSubReg() == AMDGPU::sub1);
Imm = Imm.getHiBits(32);
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 7edec5a7a5505b..22599773d562cb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1148,7 +1148,13 @@ def FPT16InputModsMatchClass : FPInputModsMatchClass<16> {
def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
-def FP16VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<16>;
+class FP16VCSrcInputModsMatchClass<bit IsFake16>
+ : FPVCSrcInputModsMatchClass<16> {
+ let Name = !if(IsFake16, "RegOrInlineImmWithFPFake16InputMods",
+ "RegOrInlineImmWithFPT16InputMods");
+ let PredicateMethod = "isRegOrInlineImmWithFP16InputMods<" #
+ !if(IsFake16, "true", "false") # ">";
+}
def FP32VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<32>;
class InputMods <AsmOperandClass matchClass> : Operand <i32> {
@@ -1166,7 +1172,8 @@ def FPT16InputMods : FPInputMods<FPT16InputModsMatchClass>;
def FP32InputMods : FPInputMods<FP32InputModsMatchClass>;
def FP64InputMods : FPInputMods<FP64InputModsMatchClass>;
-def FP16VCSrcInputMods : FPInputMods<FP16VCSrcInputModsMatchClass>;
+class FP16VCSrcInputMods<bit IsFake16>
+ : FPInputMods<FP16VCSrcInputModsMatchClass<IsFake16>>;
def FP32VCSrcInputMods : FPInputMods<FP32VCSrcInputModsMatchClass>;
class IntInputModsMatchClass <int opSize> : AsmOperandClass {
@@ -1653,11 +1660,11 @@ class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> {
}
// Return type of input modifiers operand for specified input operand for DPP
-class getSrcModVOP3DPP <ValueType VT> {
+class getSrcModVOP3DPP <ValueType VT, bit IsFake16 = 1> {
Operand ret =
!if (VT.isFP,
!if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
- FP16VCSrcInputMods, FP32VCSrcInputMods),
+ FP16VCSrcInputMods<IsFake16>, FP32VCSrcInputMods),
Int32VCSrcInputMods);
}
@@ -2450,6 +2457,10 @@ class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.NoPattern> : VOPProfile <p.
class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> {
let IsTrue16 = 1;
let IsRealTrue16 = 1;
+
+ let HasOpSel = 1;
+ let HasModifiers = 1; // All instructions at least have OpSel.
+
// Most DstVT are 16-bit, but not all.
let DstRC = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 0 /*IsVOP3Encoding*/>.ret;
let DstRC64 = getVALUDstForVT<DstVT>.ret;
@@ -2461,6 +2472,10 @@ class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> {
let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0 /*IsFake16*/>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0 /*IsFake16*/>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0 /*IsFake16*/>.ret;
+ let Src0VOP3DPP = VGPRSrc_16;
+ let Src0ModVOP3DPP = getSrcModVOP3DPP<Src0VT, 0 /*IsFake16*/>.ret;
+ let Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT, 0 /*IsFake16*/>.ret;
+ let Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT, 0 /*IsFake16*/>.ret;
let DstRC64 = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret;
let Src0RC64 = getVOP3SrcForVT<Src0VT, 1 /*IsTrue16*/>.ret;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index c9dbe02037ef2e..aabb6c29062114 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1235,6 +1235,12 @@ def VGPRSrc_16_Lo128 : RegisterOperand<VGPR_16_Lo128> {
let EncoderMethod = "getMachineOpValueT16Lo128";
}
+// True 16 operands.
+def VGPRSrc_16 : RegisterOperand<VGPR_16> {
+ let DecoderMethod = "DecodeVGPR_16RegisterClass";
+ let EncoderMethod = "getMachineOpValueT16";
+}
+
//===----------------------------------------------------------------------===//
// ASrc_* Operands with an AccVGPR
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
index 84da311108ce38..014534ab79fe64 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
@@ -50,7 +50,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
- ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
;
@@ -88,7 +88,7 @@ body: |
; GFX11: liveins: $sgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
;
@@ -127,7 +127,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
- ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
index 30975a8937db62..dcf9e169f586be 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
@@ -59,7 +59,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
- ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
;
@@ -97,7 +97,7 @@ body: |
; GFX11: liveins: $sgpr0
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
- ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
;
@@ -136,7 +136,7 @@ body: |
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
- ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
;
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
index 7767aa54c81519..9ae5f559e860af 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
@@ -66,7 +66,7 @@ body: |
; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; REAL16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, implicit $mode, implicit $exec
+ ; REAL16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
;
; FAKE16-LABEL: name: ceil_f16
; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
@@ -87,7 +87,7 @@ body: |
; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
- ; REAL16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, implicit $mode, implicit $exec
+ ; REAL16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
;
; FAKE16-LABEL: name: floor_f16
; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s
new file mode 100644
index 00000000000000..1871a41ec5983e
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s
@@ -0,0 +1,85 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s
+
+v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 row_mirror
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 row_half_mirror
+// GFX11: [0x...
[truncated]
|
Added disasm tests on
SISrcMods::OP_SEL_0}, | ||
{AMDGPU::OpName::vdst, AMDGPU::OpName::src0_modifiers, | ||
SISrcMods::DST_OP_SEL}}}; | ||
for (const auto &[OpName, OpModsName, OpSelMask] : OpAndOpMods) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't need the reference
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AMDGPUDisassembler.cpp:931:8: note: use reference type 'const std::array<std::tuple<int, int, unsigned int>, 4>::value_type &' (aka 'const std::tuple<int, int, unsigned int> &') to prevent copying
for (const auto [OpName, OpModsName, OpSelMask] : OpAndOpMods) {
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
&
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks!
[AMDGPU][MC][True16] Support V_RCP/SQRT/RSQ/LOG/EXP_F16. Also add missing v_ceil/floor_f16 tests. Includes #80892.
No description provided.