[AMDGPU][True16] Support VOP3 source DPP operands. #80892

kosarev · 2024-02-06T18:34:35Z

No description provided.

llvmbot · 2024-02-06T18:35:06Z

@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-mc

@llvm/pr-subscribers-llvm-globalisel

Author: Ivan Kosarev (kosarev)

Changes

Patch is 56.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/80892.diff

15 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+32-11)
(modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp (+37)
(modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h (+1)
(modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+22-10)
(modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+19-4)
(modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+6)
(modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir (+3-3)
(modified) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir (+3-3)
(modified) llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir (+2-2)
(added) llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s (+85)
(modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1.s (+35-29)
(added) llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1-fake16.s (+25)
(modified) llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vop1.s (+15-9)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt (+58-29)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vop1.txt (+18-9)

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 225e781588668f..a94da992b33859 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -314,8 +314,9 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isRegOrImmWithInputMods(AMDGPU::VS_64RegClassID, MVT::f64);
   }
 
-  bool isRegOrInlineImmWithFP16InputMods() const {
-    return isRegOrInline(AMDGPU::VS_32RegClassID, MVT::f16);
+  template <bool IsFake16> bool isRegOrInlineImmWithFP16InputMods() const {
+    return isRegOrInline(
+        IsFake16 ? AMDGPU::VS_32RegClassID : AMDGPU::VS_16RegClassID, MVT::f16);
   }
 
   bool isRegOrInlineImmWithFP32InputMods() const {
@@ -8151,7 +8152,7 @@ ParseStatus AMDGPUAsmParser::parseOModSI(OperandVector &Operands) {
 
 // Determines which bit DST_OP_SEL occupies in the op_sel operand according to
 // the number of src operands present, then copies that bit into src0_modifiers.
-void cvtVOP3DstOpSelOnly(MCInst &Inst) {
+static void cvtVOP3DstOpSelOnly(MCInst &Inst, const MCRegisterInfo &MRI) {
   int Opc = Inst.getOpcode();
   int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
   if (OpSelIdx == -1)
@@ -8168,23 +8169,34 @@ void cvtVOP3DstOpSelOnly(MCInst &Inst) {
 
   unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
 
-  if ((OpSel & (1 << SrcNum)) != 0) {
-    int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
-    uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
-    Inst.getOperand(ModIdx).setImm(ModVal | SISrcMods::DST_OP_SEL);
+  int DstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+  if (DstIdx == -1)
+    return;
+
+  const MCOperand &DstOp = Inst.getOperand(DstIdx);
+  int ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+  uint32_t ModVal = Inst.getOperand(ModIdx).getImm();
+  if (DstOp.isReg() &&
+      MRI.getRegClass(AMDGPU::VGPR_16RegClassID).contains(DstOp.getReg())) {
+    if (AMDGPU::isHi(DstOp.getReg(), MRI))
+      ModVal |= SISrcMods::DST_OP_SEL;
+  } else {
+    if ((OpSel & (1 << SrcNum)) != 0)
+      ModVal |= SISrcMods::DST_OP_SEL;
   }
+  Inst.getOperand(ModIdx).setImm(ModVal);
 }
 
 void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst,
                                    const OperandVector &Operands) {
   cvtVOP3P(Inst, Operands);
-  cvtVOP3DstOpSelOnly(Inst);
+  cvtVOP3DstOpSelOnly(Inst, *getMRI());
 }
 
 void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands,
                                    OptionalImmIndexMap &OptionalIdx) {
   cvtVOP3P(Inst, Operands, OptionalIdx);
-  cvtVOP3DstOpSelOnly(Inst);
+  cvtVOP3DstOpSelOnly(Inst, *getMRI());
 }
 
 static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
@@ -8433,8 +8445,17 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
 
     uint32_t ModVal = 0;
 
-    if ((OpSel & (1 << J)) != 0)
-      ModVal |= SISrcMods::OP_SEL_0;
+    const MCOperand &SrcOp = Inst.getOperand(OpIdx);
+    if (SrcOp.isReg() && getMRI()
+                             ->getRegClass(AMDGPU::VGPR_16RegClassID)
+                             .contains(SrcOp.getReg())) {
+      bool VGPRSuffixIsHi = AMDGPU::isHi(SrcOp.getReg(), *getMRI());
+      if (VGPRSuffixIsHi)
+        ModVal |= SISrcMods::OP_SEL_0;
+    } else {
+      if ((OpSel & (1 << J)) != 0)
+        ModVal |= SISrcMods::OP_SEL_0;
+    }
 
     if ((OpSelHi & (1 << J)) != 0)
       ModVal |= SISrcMods::OP_SEL_1;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index fba9eb53c8a8b4..2f48b2e3026300 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -913,6 +913,41 @@ static VOPModifiers collectVOPModifiers(const MCInst &MI,
   return Modifiers;
 }
 
+// Instructions decode the op_sel/suffix bits into the src_modifier
+// operands. Copy those bits into the src operands for true16 VGPRs.
+void AMDGPUDisassembler::convertTrue16OpSel(MCInst &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  const MCRegisterClass &ConversionRC =
+      MRI.getRegClass(AMDGPU::VGPR_16RegClassID);
+  constexpr std::array<std::tuple<int, int, unsigned>, 4> OpAndOpMods = {
+      {{AMDGPU::OpName::src0, AMDGPU::OpName::src0_modifiers,
+        SISrcMods::OP_SEL_0},
+       {AMDGPU::OpName::src1, AMDGPU::OpName::src1_modifiers,
+        SISrcMods::OP_SEL_0},
+       {AMDGPU::OpName::src2, AMDGPU::OpName::src2_modifiers,
+        SISrcMods::OP_SEL_0},
+       {AMDGPU::OpName::vdst, AMDGPU::OpName::src0_modifiers,
+        SISrcMods::DST_OP_SEL}}};
+  for (const auto &[OpName, OpModsName, OpSelMask] : OpAndOpMods) {
+    int OpIdx = AMDGPU::getNamedOperandIdx(Opc, OpName);
+    int OpModsIdx = AMDGPU::getNamedOperandIdx(Opc, OpModsName);
+    if (OpIdx == -1 || OpModsIdx == -1)
+      continue;
+    MCOperand &Op = MI.getOperand(OpIdx);
+    if (!Op.isReg())
+      continue;
+    if (!ConversionRC.contains(Op.getReg()))
+      continue;
+    unsigned OpEnc = MRI.getEncodingValue(Op.getReg());
+    const MCOperand &OpMods = MI.getOperand(OpModsIdx);
+    unsigned ModVal = OpMods.getImm();
+    if (ModVal & OpSelMask) { // isHi
+      unsigned RegIdx = OpEnc & AMDGPU::HWEncoding::REG_IDX_MASK;
+      Op.setReg(ConversionRC.getRegister(RegIdx * 2 + 1));
+    }
+  }
+}
+
 // MAC opcodes have special old and src2 operands.
 // src2 is tied to dst, while old is not tied (but assumed to be).
 bool AMDGPUDisassembler::isMacDPP(MCInst &MI) const {
@@ -991,6 +1026,8 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
   if (isMacDPP(MI))
     convertMacDPPInst(MI);
 
+  convertTrue16OpSel(MI);
+
   int VDstInIdx =
       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
   if (VDstInIdx != -1)
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 5a89b30f6fb36a..02feaf553c0c45 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -203,6 +203,7 @@ class AMDGPUDisassembler : public MCDisassembler {
   DecodeStatus convertVOP3PDPPInst(MCInst &MI) const;
   DecodeStatus convertVOPCDPPInst(MCInst &MI) const;
   void convertMacDPPInst(MCInst &MI) const;
+  void convertTrue16OpSel(MCInst &MI) const;
 
   enum OpWidthTy {
     OPW32,
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index a812cdc61500cc..8bf05682cbe7ea 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -756,14 +756,14 @@ void SIFoldOperands::foldOperand(
   int UseOpIdx,
   SmallVectorImpl<FoldCandidate> &FoldList,
   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
-  const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
+  const MachineOperand *UseOp = &UseMI->getOperand(UseOpIdx);
 
-  if (!isUseSafeToFold(*UseMI, UseOp))
+  if (!isUseSafeToFold(*UseMI, *UseOp))
     return;
 
   // FIXME: Fold operands with subregs.
-  if (UseOp.isReg() && OpToFold.isReg() &&
-      (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister))
+  if (UseOp->isReg() && OpToFold.isReg() &&
+      (UseOp->isImplicit() || UseOp->getSubReg() != AMDGPU::NoSubRegister))
     return;
 
   // Special case for REG_SEQUENCE: We can't fold literals into
@@ -859,7 +859,6 @@ void SIFoldOperands::foldOperand(
     if (MovOp == AMDGPU::COPY)
       return;
 
-    UseMI->setDesc(TII->get(MovOp));
     MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
     MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
     while (ImpOpI != ImpOpE) {
@@ -867,6 +866,19 @@ void SIFoldOperands::foldOperand(
       ImpOpI++;
       UseMI->removeOperand(UseMI->getOperandNo(Tmp));
     }
+    UseMI->setDesc(TII->get(MovOp));
+
+    if (MovOp == AMDGPU::V_MOV_B16_t16_e64) {
+      const auto &SrcOp = UseMI->getOperand(UseOpIdx);
+      MachineOperand NewSrcOp(SrcOp);
+      MachineFunction *MF = UseMI->getParent()->getParent();
+      UseMI->removeOperand(1);
+      UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // src0_modifiers
+      UseMI->addOperand(NewSrcOp);                          // src0
+      UseMI->addOperand(*MF, MachineOperand::CreateImm(0)); // op_sel
+      UseOpIdx = 2;
+      UseOp = &UseMI->getOperand(UseOpIdx);
+    }
     CopiesToReplace.push_back(UseMI);
   } else {
     if (UseMI->isCopy() && OpToFold.isReg() &&
@@ -1027,7 +1039,7 @@ void SIFoldOperands::foldOperand(
 
     // Don't fold into target independent nodes.  Target independent opcodes
     // don't have defined register classes.
-    if (UseDesc.isVariadic() || UseOp.isImplicit() ||
+    if (UseDesc.isVariadic() || UseOp->isImplicit() ||
         UseDesc.operands()[UseOpIdx].RegClass == -1)
       return;
   }
@@ -1062,17 +1074,17 @@ void SIFoldOperands::foldOperand(
       TRI->getRegClass(FoldDesc.operands()[0].RegClass);
 
   // Split 64-bit constants into 32-bits for folding.
-  if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
-    Register UseReg = UseOp.getReg();
+  if (UseOp->getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
+    Register UseReg = UseOp->getReg();
     const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
     if (AMDGPU::getRegBitWidth(*UseRC) != 64)
       return;
 
     APInt Imm(64, OpToFold.getImm());
-    if (UseOp.getSubReg() == AMDGPU::sub0) {
+    if (UseOp->getSubReg() == AMDGPU::sub0) {
       Imm = Imm.getLoBits(32);
     } else {
-      assert(UseOp.getSubReg() == AMDGPU::sub1);
+      assert(UseOp->getSubReg() == AMDGPU::sub1);
       Imm = Imm.getHiBits(32);
     }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 7edec5a7a5505b..22599773d562cb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1148,7 +1148,13 @@ def FPT16InputModsMatchClass : FPInputModsMatchClass<16> {
 def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
 def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
 
-def FP16VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<16>;
+class FP16VCSrcInputModsMatchClass<bit IsFake16>
+    : FPVCSrcInputModsMatchClass<16> {
+  let Name = !if(IsFake16, "RegOrInlineImmWithFPFake16InputMods",
+                 "RegOrInlineImmWithFPT16InputMods");
+  let PredicateMethod = "isRegOrInlineImmWithFP16InputMods<" #
+                        !if(IsFake16, "true", "false") # ">";
+}
 def FP32VCSrcInputModsMatchClass : FPVCSrcInputModsMatchClass<32>;
 
 class InputMods <AsmOperandClass matchClass> : Operand <i32> {
@@ -1166,7 +1172,8 @@ def FPT16InputMods : FPInputMods<FPT16InputModsMatchClass>;
 def FP32InputMods : FPInputMods<FP32InputModsMatchClass>;
 def FP64InputMods : FPInputMods<FP64InputModsMatchClass>;
 
-def FP16VCSrcInputMods : FPInputMods<FP16VCSrcInputModsMatchClass>;
+class FP16VCSrcInputMods<bit IsFake16>
+  : FPInputMods<FP16VCSrcInputModsMatchClass<IsFake16>>;
 def FP32VCSrcInputMods : FPInputMods<FP32VCSrcInputModsMatchClass>;
 
 class IntInputModsMatchClass <int opSize> : AsmOperandClass {
@@ -1653,11 +1660,11 @@ class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> {
 }
 
 // Return type of input modifiers operand for specified input operand for DPP
-class getSrcModVOP3DPP <ValueType VT> {
+class getSrcModVOP3DPP <ValueType VT, bit IsFake16 = 1> {
   Operand ret =
       !if (VT.isFP,
            !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
-                FP16VCSrcInputMods, FP32VCSrcInputMods),
+                FP16VCSrcInputMods<IsFake16>, FP32VCSrcInputMods),
            Int32VCSrcInputMods);
 }
 
@@ -2450,6 +2457,10 @@ class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.NoPattern> : VOPProfile <p.
 class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> {
   let IsTrue16 = 1;
   let IsRealTrue16 = 1;
+
+  let HasOpSel = 1;
+  let HasModifiers = 1; // All instructions at least have OpSel.
+
   // Most DstVT are 16-bit, but not all.
   let DstRC = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 0 /*IsVOP3Encoding*/>.ret;
   let DstRC64 = getVALUDstForVT<DstVT>.ret;
@@ -2461,6 +2472,10 @@ class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> {
   let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0 /*IsFake16*/>.ret;
   let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0 /*IsFake16*/>.ret;
   let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0 /*IsFake16*/>.ret;
+  let Src0VOP3DPP = VGPRSrc_16;
+  let Src0ModVOP3DPP = getSrcModVOP3DPP<Src0VT, 0 /*IsFake16*/>.ret;
+  let Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT, 0 /*IsFake16*/>.ret;
+  let Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT, 0 /*IsFake16*/>.ret;
 
   let DstRC64 = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret;
   let Src0RC64 = getVOP3SrcForVT<Src0VT, 1 /*IsTrue16*/>.ret;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index c9dbe02037ef2e..aabb6c29062114 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -1235,6 +1235,12 @@ def VGPRSrc_16_Lo128 : RegisterOperand<VGPR_16_Lo128> {
   let EncoderMethod = "getMachineOpValueT16Lo128";
 }
 
+// True 16 operands.
+def VGPRSrc_16 : RegisterOperand<VGPR_16> {
+  let DecoderMethod = "DecodeVGPR_16RegisterClass";
+  let EncoderMethod = "getMachineOpValueT16";
+}
+
 //===----------------------------------------------------------------------===//
 //  ASrc_* Operands with an AccVGPR
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
index 84da311108ce38..014534ab79fe64 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir
@@ -50,7 +50,7 @@ body: |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
-    ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
     ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
     ;
@@ -88,7 +88,7 @@ body: |
     ; GFX11: liveins: $sgpr0
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
     ; GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
     ;
@@ -127,7 +127,7 @@ body: |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
-    ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CEIL_F16_t16_e64_]]
     ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
     ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
index 30975a8937db62..dcf9e169f586be 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir
@@ -59,7 +59,7 @@ body: |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
-    ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
     ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
     ;
@@ -97,7 +97,7 @@ body: |
     ; GFX11: liveins: $sgpr0
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
-    ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
     ; GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
     ;
@@ -136,7 +136,7 @@ body: |
     ; GFX11-NEXT: {{  $}}
     ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]]
-    ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+    ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
     ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_FLOOR_F16_t16_e64_]]
     ; GFX11-NEXT: $vgpr0 = COPY [[COPY2]]
     ;
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
index 7767aa54c81519..9ae5f559e860af 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir
@@ -66,7 +66,7 @@ body:             |
     ; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
     ; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-    ; REAL16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, implicit $mode, implicit $exec
+    ; REAL16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
     ;
     ; FAKE16-LABEL: name: ceil_f16
     ; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
@@ -87,7 +87,7 @@ body:             |
     ; REAL16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; REAL16-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
     ; REAL16-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-    ; REAL16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, implicit $mode, implicit $exec
+    ; REAL16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[V_CVT_F32_U32_e64_]].lo16, 0, 0, 0, implicit $mode, implicit $exec
     ;
     ; FAKE16-LABEL: name: floor_f16
     ; FAKE16: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s
new file mode 100644
index 00000000000000..1871a41ec5983e
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vop1-fake16.s
@@ -0,0 +1,85 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,+wavefrontsize32,-wavefrontsize64 -show-encoding %s | FileCheck --check-prefixes=GFX11 %s
+
+v_ceil_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0]
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3]
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 row_mirror
+// GFX11: [0x05,0x00,0xdc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff]
+
+v_ceil_f16_e64_dpp v5, v1 row_half_mirror
+// GFX11: [0x...
[truncated]

llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt

kosarev · 2024-02-08T11:02:47Z

Added diasm tests on .h operands and added another convertTrue16OpSel() call to fix such operands for DPP8 instructions.

arsenm · 2024-02-08T11:16:35Z

llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp

+        SISrcMods::OP_SEL_0},
+       {AMDGPU::OpName::vdst, AMDGPU::OpName::src0_modifiers,
+        SISrcMods::DST_OP_SEL}}};
+  for (const auto &[OpName, OpModsName, OpSelMask] : OpAndOpMods) {


Don't need the reference

AMDGPUDisassembler.cpp:931:8: note: use reference type 'const std::array<std::tuple<int, int, unsigned int>, 4>::value_type &' (aka 'const std::tuple<int, int, unsigned int> &') to prevent copying for (const auto [OpName, OpModsName, OpSelMask] : OpAndOpMods) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ &

Sisyph

LGTM, thanks!

[AMDGPU][MC][True16] Support V_RCP/SQRT/RSQ/LOG/EXP_F16. Also add missing v_ceil/floor_f16 tests. Includes #80892.

kosarev requested review from jayfoad, arsenm, Sisyph and rampitec February 6, 2024 18:34

llvmbot added backend:AMDGPU mc Machine (object) code llvm:globalisel labels Feb 6, 2024

Sisyph reviewed Feb 7, 2024

View reviewed changes

llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vop1.txt Show resolved Hide resolved

[AMDGPU][True16] Support VOP3 source DPP operands.

7d8e535

kosarev force-pushed the t16-dpp branch from 29ec87c to 7d8e535 Compare February 8, 2024 11:00

kosarev requested a review from Sisyph February 8, 2024 11:02

arsenm reviewed Feb 8, 2024

View reviewed changes

Sisyph approved these changes Feb 8, 2024

View reviewed changes

kosarev merged commit 7d19dc5 into llvm:main Feb 8, 2024
4 checks passed

kosarev deleted the t16-dpp branch February 8, 2024 16:23

kosarev mentioned this pull request Feb 8, 2024

[AMDGPU][MC][True16] Support V_RCP/SQRT/RSQ/LOG/EXP_F16. #81131

Merged

kosarev added a commit that referenced this pull request Feb 19, 2024

[AMDGPU][MC][True16] Support V_RCP/SQRT/RSQ/LOG/EXP_F16. (#81131)

0ec524b

[AMDGPU][MC][True16] Support V_RCP/SQRT/RSQ/LOG/EXP_F16. Also add missing v_ceil/floor_f16 tests. Includes #80892.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AMDGPU][True16] Support VOP3 source DPP operands. #80892

[AMDGPU][True16] Support VOP3 source DPP operands. #80892

kosarev commented Feb 6, 2024

llvmbot commented Feb 6, 2024 •

edited

Loading

kosarev commented Feb 8, 2024

arsenm Feb 8, 2024

kosarev Feb 8, 2024

Sisyph left a comment

[AMDGPU][True16] Support VOP3 source DPP operands. #80892

[AMDGPU][True16] Support VOP3 source DPP operands. #80892

Conversation

kosarev commented Feb 6, 2024

llvmbot commented Feb 6, 2024 • edited Loading

kosarev commented Feb 8, 2024

arsenm Feb 8, 2024

Choose a reason for hiding this comment

kosarev Feb 8, 2024

Choose a reason for hiding this comment

Sisyph left a comment

Choose a reason for hiding this comment

llvmbot commented Feb 6, 2024 •

edited

Loading