Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU][GFX12] VOP encoding and codegen - add support for v_cvt fp8/… #78414

Merged
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/amdgpu-features.cl
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@
// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"

// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"

Expand Down
35 changes: 18 additions & 17 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
Original file line number Diff line number Diff line change
@@ -1,59 +1,60 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s

typedef float v2f __attribute__((ext_vector_type(2)));

// CHECK-GFX940-LABEL: @test_cvt_f32_bf8
// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
// CHECK-LABEL: @test_cvt_f32_bf8
// CHECK: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
// Exercises __builtin_amdgcn_cvt_f32_bf8 with source `a` and the constant
// second argument 0. The FileCheck directives above expect a float-returning
// call to the llvm.amdgcn.cvt.f32.bf8 intrinsic; the result is stored through
// the int pointer `out`.
void test_cvt_f32_bf8(global int* out, int a)
{
*out = __builtin_amdgcn_cvt_f32_bf8(a, 0);
}

// CHECK-GFX940-LABEL: @test_cvt_f32_fp8
// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
// CHECK-LABEL: @test_cvt_f32_fp8
// CHECK: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
// Exercises __builtin_amdgcn_cvt_f32_fp8 with source `a` and the constant
// second argument 1. The FileCheck directives above expect a float-returning
// call to the llvm.amdgcn.cvt.f32.fp8 intrinsic; the result is stored through
// the int pointer `out`.
void test_cvt_f32_fp8(global int* out, int a)
{
*out = __builtin_amdgcn_cvt_f32_fp8(a, 1);
}

// CHECK-GFX940-LABEL: @test_cvt_pk_f32_bf8
// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
// CHECK-LABEL: @test_cvt_pk_f32_bf8
// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
// Exercises __builtin_amdgcn_cvt_pk_f32_bf8 with source `a` and the constant
// flag `false`. The FileCheck directives above expect a call to the
// llvm.amdgcn.cvt.pk.f32.bf8 intrinsic returning <2 x float>, which is stored
// through the v2f pointer `out`.
void test_cvt_pk_f32_bf8(global v2f* out, int a)
{
*out = __builtin_amdgcn_cvt_pk_f32_bf8(a, false);
}

// CHECK-GFX940-LABEL: @test_cvt_pk_f32_fp8
// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
// CHECK-LABEL: @test_cvt_pk_f32_fp8
// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
// Exercises __builtin_amdgcn_cvt_pk_f32_fp8 with source `a` and the constant
// flag `true`. The FileCheck directives above expect a call to the
// llvm.amdgcn.cvt.pk.f32.fp8 intrinsic returning <2 x float>, which is stored
// through the v2f pointer `out`.
void test_cvt_pk_f32_fp8(global v2f* out, int a)
{
*out = __builtin_amdgcn_cvt_pk_f32_fp8(a, true);
}

// CHECK-GFX940-LABEL: @test_cvt_pk_bf8_f32
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false)
// CHECK-LABEL: @test_cvt_pk_bf8_f32
// CHECK: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false)
// Exercises __builtin_amdgcn_cvt_pk_bf8_f32 with the two float sources `a` and
// `b`, the previous value `old`, and the constant flag `false`. The FileCheck
// directives above expect an i32-returning call to the
// llvm.amdgcn.cvt.pk.bf8.f32 intrinsic; the result is stored through `out`.
void test_cvt_pk_bf8_f32(global int* out, int old, float a, float b)
{
*out = __builtin_amdgcn_cvt_pk_bf8_f32(a, b, old, false);
}

// CHECK-GFX940-LABEL: @test_cvt_pk_fp8_f32
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true)
// CHECK-LABEL: @test_cvt_pk_fp8_f32
// CHECK: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true)
// Exercises __builtin_amdgcn_cvt_pk_fp8_f32 with the two float sources `a` and
// `b`, the previous value `old`, and the constant flag `true`. The FileCheck
// directives above expect an i32-returning call to the
// llvm.amdgcn.cvt.pk.fp8.f32 intrinsic; the result is stored through `out`.
void test_cvt_pk_fp8_f32(global int* out, int old, float a, float b)
{
*out = __builtin_amdgcn_cvt_pk_fp8_f32(a, b, old, true);
}

// CHECK-GFX940-LABEL: @test_cvt_sr_bf8_f32
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2)
// CHECK-LABEL: @test_cvt_sr_bf8_f32
// CHECK: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2)
// Exercises __builtin_amdgcn_cvt_sr_bf8_f32 with float source `a`, integer
// operand `b`, the previous value `old`, and the constant last argument 2.
// The FileCheck directives above expect an i32-returning call to the
// llvm.amdgcn.cvt.sr.bf8.f32 intrinsic; the result is stored through `out`.
void test_cvt_sr_bf8_f32(global int* out, int old, float a, int b)
{
*out = __builtin_amdgcn_cvt_sr_bf8_f32(a, b, old, 2);
}

// CHECK-GFX940-LABEL: @test_cvt_sr_fp8_f32
// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3)
// CHECK-LABEL: @test_cvt_sr_fp8_f32
// CHECK: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3)
void test_cvt_sr_fp8_f32(global int* out, int old, float a, int b)
{
*out = __builtin_amdgcn_cvt_sr_fp8_f32(a, b, old, 3);
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -1506,6 +1506,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeatureExtendedImageInsts,
FeatureFP8ConversionInsts,
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
FeatureSALUFloatInsts,
Expand Down
31 changes: 29 additions & 2 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3500,6 +3500,9 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
return !isInlineConstant(Inst, OpIdx);
} else if (MO.isReg()) {
auto Reg = MO.getReg();
if (!Reg) {
return false;
}
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
auto PReg = mc2PseudoReg(Reg);
return isSGPR(PReg, TRI) && PReg != SGPR_NULL;
Expand Down Expand Up @@ -8303,12 +8306,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;

if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) {
Opc == AMDGPU::V_CVT_SR_FP8_F32_vi ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) {
Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
Inst.addOperand(Inst.getOperand(0));
}

if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in)) {
// Adding vdst_in operand is already covered for these DPP instructions in
// cvtVOP3DPP.
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) &&
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) {
assert(!IsPacked);
Inst.addOperand(Inst.getOperand(0));
}
Expand Down Expand Up @@ -8770,6 +8781,22 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
}
}

int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
if (VdstInIdx == static_cast<int>(Inst.getNumOperands())) {
Inst.addOperand(Inst.getOperand(0));
}

bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12;
Comment on lines +8789 to +8792
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't want to hold this up for the release, but I do think this needs to be revisited. We should really avoid having more random lists of opcodes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I will prepare different PRs to cover this and what Joe pointed out.

if (IsVOP3CvtSrDpp) {
if (Src2ModIdx == static_cast<int>(Inst.getNumOperands())) {
Inst.addOperand(MCOperand::createImm(0));
Inst.addOperand(MCOperand::createReg(0));
}
}

auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
MCOI::TIED_TO);
if (TiedTo != -1) {
Expand Down
26 changes: 26 additions & 0 deletions llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
AMDGPU::OpName::src2_modifiers);
}

if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) {
// Insert dummy unused src2_modifiers.
insertNamedMCOperand(MI, MCOperand::createImm(0),
AMDGPU::OpName::src2_modifiers);
}

if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
!AMDGPU::hasGDS(STI)) {
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
Expand Down Expand Up @@ -942,6 +949,7 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
// first add optional MI operands to check FI
DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();

if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
convertVOP3PDPPInst(MI);
} else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
Expand All @@ -951,6 +959,15 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
if (isMacDPP(MI))
convertMacDPPInst(MI);

int VDstInIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
if (VDstInIdx != -1)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);

if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);

unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
Expand All @@ -977,6 +994,15 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
if (isMacDPP(MI))
convertMacDPPInst(MI);

int VDstInIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
if (VDstInIdx != -1)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);

if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12)
insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);

unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
if (MI.getNumOperands() < DescNumOps &&
Expand Down
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1305,6 +1305,16 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
const MCSubtargetInfo &STI,
raw_ostream &O) {
unsigned Opc = MI->getOpcode();
if (isCvt_F32_Fp8_Bf8_e64(Opc)) {
auto SrcMod =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
unsigned Mod = MI->getOperand(SrcMod).getImm();
unsigned Index0 = !!(Mod & SISrcMods::OP_SEL_0);
unsigned Index1 = !!(Mod & SISrcMods::OP_SEL_1);
if (Index0 || Index1)
O << " op_sel:[" << Index0 << ',' << Index1 << ']';
return;
}
if (isPermlane16(Opc)) {
auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
Expand Down
7 changes: 5 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1684,8 +1684,9 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
!if(HasOMod,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
clampmod0:$clamp, omod0:$omod),
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
clampmod0:$clamp))
!if (HasClamp,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0, clampmod0:$clamp),
(ins Src0Mod:$src0_modifiers, Src0RC:$src0)))
/* else */,
// VOP1 without modifiers
!if (HasClamp,
Expand Down Expand Up @@ -2279,6 +2280,8 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit IsSingle = 0;
field bit IsWMMA = 0;

field bit IsFP8 = 0;

field bit HasDst = !ne(DstVT.Value, untyped.Value);
field bit HasDst32 = HasDst;
field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case
Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,17 @@ bool isPermlane16(unsigned Opc) {
Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12;
}

/// \returns true when \p Opc is one of the GFX12 e64 opcodes handled here:
/// V_CVT_F32_{BF8,FP8} in its plain, DPP or DPP8 form, or the plain
/// V_CVT_PK_F32_{BF8,FP8}. Every other opcode yields false.
bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) {
  switch (Opc) {
  // Single-value conversions: plain e64, DPP and DPP8 variants.
  case AMDGPU::V_CVT_F32_BF8_e64_gfx12:
  case AMDGPU::V_CVT_F32_FP8_e64_gfx12:
  case AMDGPU::V_CVT_F32_BF8_e64_dpp_gfx12:
  case AMDGPU::V_CVT_F32_FP8_e64_dpp_gfx12:
  case AMDGPU::V_CVT_F32_BF8_e64_dpp8_gfx12:
  case AMDGPU::V_CVT_F32_FP8_e64_dpp8_gfx12:
  // Packed conversions: only the plain e64 variants are listed.
  case AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12:
  case AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12:
    return true;
  default:
    return false;
  }
}

bool isGenericAtomic(unsigned Opc) {
return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN ||
Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX ||
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,9 @@ bool isPermlane16(unsigned Opc);
LLVM_READNONE
bool isGenericAtomic(unsigned Opc);

/// \returns true if \p Opc is one of the GFX12 e64 opcodes
/// V_CVT_F32_{BF8,FP8} (plain, DPP or DPP8 form) or V_CVT_PK_F32_{BF8,FP8}
/// (plain form only); false for any other opcode.
LLVM_READNONE
bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc);

namespace VOPD {

enum Component : unsigned {
Expand Down
Loading