Skip to content

Commit

Permalink
AMDGPU: Add bf16 vectors to register class definitions (#76214)
Browse files Browse the repository at this point in the history
Assorted intrinsics are currently using i16 in place of a proper
bfloat type, but they should really switch to bfloat.
    
Note this only changes the type lists in tablegen, these are still
not registered to be truly treated as a legal type yet.

Depends #76213
  • Loading branch information
arsenm committed Jan 2, 2024
1 parent 2eb0ac0 commit c1764a7
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 42 deletions.
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1487,8 +1487,18 @@ foreach Index = 0-31 in {
// 16-bit bitcast
def : BitConvert <i16, f16, VGPR_32>;
def : BitConvert <f16, i16, VGPR_32>;
def : BitConvert <f16, bf16, VGPR_32>;
def : BitConvert <bf16, f16, VGPR_32>;

def : BitConvert <i16, f16, SReg_32>;
def : BitConvert <f16, i16, SReg_32>;
def : BitConvert <f16, bf16, SReg_32>;
def : BitConvert <bf16, f16, SReg_32>;

def : BitConvert <i16, bf16, VGPR_32>;
def : BitConvert <bf16, i16, VGPR_32>;
def : BitConvert <i16, bf16, SReg_32>;
def : BitConvert <bf16, i16, SReg_32>;

// 32-bit bitcast
def : BitConvert <i32, f32, VGPR_32>;
Expand Down
62 changes: 31 additions & 31 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ def M0_CLASS : SIRegisterClass<"AMDGPU", [i32], 32, (add M0)> {
let HasSGPR = 1;
}

def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16, (add M0_LO16)> {
let CopyCost = 1;
let Size = 16;
let isAllocatable = 0;
Expand All @@ -385,15 +385,15 @@ def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {

// TODO: Do we need to set DwarfRegAlias on register tuples?

def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
(add (sequence "SGPR%u_LO16", 0, 105))> {
let AllocationPriority = 0;
let Size = 16;
let GeneratePressureSet = 0;
let HasSGPR = 1;
}

def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
(add (sequence "SGPR%u_HI16", 0, 105))> {
let isAllocatable = 0;
let Size = 16;
Expand All @@ -402,7 +402,7 @@ def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
}

// SGPR 32-bit registers
def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add (sequence "SGPR%u", 0, 105))> {
// Give all SGPR classes higher priority than VGPR classes, because
// we want to spill SGPRs to VGPRs.
Expand Down Expand Up @@ -451,14 +451,14 @@ def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s"
def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">;

// Trap handler TMP 32-bit registers
def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16, v2bf16], 32,
(add (sequence "TTMP%u", 0, 15))> {
let isAllocatable = 0;
let HasSGPR = 1;
}

// Trap handler TMP 16-bit registers
def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
(add (sequence "TTMP%u_LO16", 0, 15))> {
let Size = 16;
let isAllocatable = 0;
Expand Down Expand Up @@ -584,8 +584,8 @@ class RegisterTypes<list<ValueType> reg_types> {
list<ValueType> types = reg_types;
}

def Reg16Types : RegisterTypes<[i16, f16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;

let HasVGPR = 1 in {
// VOP3 and VINTERP can access 256 lo and 256 hi registers.
Expand Down Expand Up @@ -683,7 +683,7 @@ def AGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
}

// AccVGPR 32-bit registers
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def AGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add (sequence "AGPR%u", 0, 255))> {
let AllocationPriority = 0;
let Size = 32;
Expand Down Expand Up @@ -735,15 +735,15 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
// Register classes used as source and destination
//===----------------------------------------------------------------------===//

def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add FP_REG, SP_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
let HasSGPR = 1;
let BaseClassOrder = 10000;
}

def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16], 32,
def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16, v8bf16], 32,
(add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
Expand All @@ -760,7 +760,7 @@ def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,
let GeneratePressureSet = 0, HasSGPR = 1 in {
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
SGPR_NULL, SGPR_NULL_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE_LO,
SRC_SHARED_LIMIT_LO, SRC_PRIVATE_BASE_LO, SRC_PRIVATE_LIMIT_LO, SRC_SHARED_BASE_HI,
Expand All @@ -769,7 +769,7 @@ def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2
let AllocationPriority = 0;
}

def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16, bf16], 16,
(add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16,
XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, SGPR_NULL_HI_LO16, TTMP_LO16,
TMA_LO_LO16, TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO_LO16,
Expand All @@ -782,39 +782,39 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
let BaseClassOrder = 16;
}

def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
def SReg_32_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
(add SReg_32_XM0_XEXEC, M0_CLASS)> {
let AllocationPriority = 0;
}

def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
(add SReg_32_XEXEC, EXEC_LO)> {
let AllocationPriority = 0;
}

def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
let AllocationPriority = 0;
}

} // End GeneratePressureSet = 0

// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16, i1], 32,
(add SReg_32_XM0, M0_CLASS)> {
let AllocationPriority = 0;
let HasSGPR = 1;
let BaseClassOrder = 32;
}

let GeneratePressureSet = 0 in {
def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasSGPR = 1;
}

def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4bf16], 32,
(add SGPR_64Regs)> {
let CopyCost = 1;
let AllocationPriority = 1;
Expand All @@ -836,21 +836,21 @@ def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
let HasSGPR = 1;
}

def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16, v4bf16], 32,
(add TTMP_64Regs)> {
let isAllocatable = 0;
let HasSGPR = 1;
}

def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
(add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, SGPR_NULL64, SRC_SHARED_BASE,
SRC_SHARED_LIMIT, SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, TTMP_64, TBA, TMA)> {
let CopyCost = 1;
let AllocationPriority = 1;
let HasSGPR = 1;
}

def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16, v4bf16], 32,
(add SReg_64_XEXEC, EXEC)> {
let CopyCost = 1;
let AllocationPriority = 1;
Expand Down Expand Up @@ -905,11 +905,11 @@ multiclass SRegClass<int numRegs,
}

defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>;
defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], SGPR_128Regs, TTMP_128Regs>;
defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], SGPR_256Regs, TTMP_256Regs>;
defm "" : SRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], SGPR_256Regs, TTMP_256Regs>;
defm "" : SRegClass<9, [v9i32, v9f32], SGPR_288Regs, TTMP_288Regs>;
defm "" : SRegClass<10, [v10i32, v10f32], SGPR_320Regs, TTMP_320Regs>;
defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>;
Expand All @@ -920,7 +920,7 @@ defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512
defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
}

def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def VRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add VGPR_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasVGPR = 1;
Expand Down Expand Up @@ -955,15 +955,15 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
}
}

defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4bf16, v4i16, p0, p1, p4],
(add VGPR_64)>;
defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add VGPR_128)>;
defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add VGPR_128)>;
defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;

defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
defm VReg_224 : VRegClass<7, [v7i32, v7f32], (add VGPR_224)>;
defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16], (add VGPR_256)>;
defm VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64, v16i16, v16f16, v16bf16], (add VGPR_256)>;
defm VReg_288 : VRegClass<9, [v9i32, v9f32], (add VGPR_288)>;
defm VReg_320 : VRegClass<10, [v10i32, v10f32], (add VGPR_320)>;
defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>;
Expand Down Expand Up @@ -993,7 +993,7 @@ multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
(add AGPR_64)>;
defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add AGPR_128)>;
defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add AGPR_128)>;
defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
Expand Down Expand Up @@ -1032,14 +1032,14 @@ def VS_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
let HasVGPR = 1;
}

def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasVGPR = 1;
let HasSGPR = 1;
}

def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def VS_32_Lo128 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v2f16, v2bf16], 32,
(add VGPR_32_Lo128, SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasVGPR = 1;
Expand Down
22 changes: 11 additions & 11 deletions llvm/lib/Target/AMDGPU/VOP3PInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -190,9 +190,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
// because dealing with the write to high half of the register is
// difficult.
def : GCNPat <
(build_vector f16:$elt0, (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
(f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))),
(build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
(f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
(v2f16 (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
Expand All @@ -203,9 +203,9 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
def : GCNPat <
(build_vector
f16:$elt0,
(AMDGPUclamp (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
(AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
(f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
(f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
(v2f16 (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
$src2_modifiers, $src2,
Expand All @@ -215,12 +215,12 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,

def : GCNPat <
(AMDGPUclamp (build_vector
(fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
(f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
(f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
(f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers)))),
(fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
(f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
(f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
(f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
(f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers)))))),
(f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
(v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
$hi_src1_modifiers, $hi_src1,
$hi_src2_modifiers, $hi_src2,
Expand All @@ -243,8 +243,8 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;

def : GCNPat <
(build_vector f16:$elt0, (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
(f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
(build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
(f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
(v2f16 (mixhi_inst $src0_modifiers, $src0,
$src1_modifiers, $src1,
(i32 0), (i32 0),
Expand Down

0 comments on commit c1764a7

Please sign in to comment.