Skip to content

Commit

Permalink
[AMDGPU] Add a regclass flag for scalar registers
Browse files Browse the repository at this point in the history
Along with vector RC flags, this scalar flag will
make various regclass queries like `isVGPR` more
accurate.

Regclasses other than vector classes currently get
the new flag as well, even though certain unallocatable
classes aren't truly scalar. This is acceptable as
long as they remain unallocatable.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D110053
  • Loading branch information
cdevadas committed Dec 2, 2021
1 parent c16b13e commit 399b7de
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 38 deletions.
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/SIDefines.h
Expand Up @@ -18,7 +18,8 @@ namespace llvm {
enum SIRCFlags : uint8_t {
// For vector registers.
HasVGPR = 1 << 0,
HasAGPR = 1 << 1
HasAGPR = 1 << 1,
HasSGPR = 1 << 2
}; // enum SIRCFlags

namespace SIInstrFlags {
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
Expand Up @@ -1170,7 +1170,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,

unsigned I = MI.getOperandNo(&Op);
if (Desc.OpInfo[I].RegClass == -1 ||
!TRI->isVGPRClass(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
!TRI->isVSSuperClass(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
continue;

if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
Expand Down
18 changes: 14 additions & 4 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.h
Expand Up @@ -172,7 +172,7 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {

/// \returns true if this class contains only SGPR registers
static bool isSGPRClass(const TargetRegisterClass *RC) {
return !hasVGPRs(RC) && !hasAGPRs(RC);
return hasSGPRs(RC) && !hasVGPRs(RC) && !hasAGPRs(RC);
}

/// \returns true if this class ID contains only SGPR registers
Expand All @@ -184,17 +184,22 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {

/// \returns true if this class contains only VGPR registers
static bool isVGPRClass(const TargetRegisterClass *RC) {
return hasVGPRs(RC) && !hasAGPRs(RC);
return hasVGPRs(RC) && !hasAGPRs(RC) && !hasSGPRs(RC);
}

/// \returns true if this class contains only AGPR registers
static bool isAGPRClass(const TargetRegisterClass *RC) {
return hasAGPRs(RC) && !hasVGPRs(RC);
return hasAGPRs(RC) && !hasVGPRs(RC) && !hasSGPRs(RC);
}

/// \returns true only if this class contains both VGPR and AGPR registers
bool isVectorSuperClass(const TargetRegisterClass *RC) const {
return hasVGPRs(RC) && hasAGPRs(RC);
return hasVGPRs(RC) && hasAGPRs(RC) && !hasSGPRs(RC);
}

/// \returns true only if this class contains both VGPR and SGPR registers
bool isVSSuperClass(const TargetRegisterClass *RC) const {
return hasVGPRs(RC) && hasSGPRs(RC) && !hasAGPRs(RC);
}

/// \returns true if this class contains VGPR registers.
Expand All @@ -207,6 +212,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
return RC->TSFlags & SIRCFlags::HasAGPR;
}

/// \returns true if this class contains SGPR registers.
static bool hasSGPRs(const TargetRegisterClass *RC) {
return RC->TSFlags & SIRCFlags::HasSGPR;
}

/// \returns true if this class contains any vector registers.
static bool hasVectorRegisters(const TargetRegisterClass *RC) {
return hasVGPRs(RC) || hasAGPRs(RC);
Expand Down
90 changes: 58 additions & 32 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.td
Expand Up @@ -133,9 +133,13 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
field bit HasVGPR = 0;
field bit HasAGPR = 0;

// For scalar register classes.
field bit HasSGPR = 0;

// These need to be kept in sync with the enum SIRCFlags.
let TSFlags{0} = HasVGPR;
let TSFlags{1} = HasAGPR;
let TSFlags{2} = HasSGPR;
}

multiclass SIRegLoHi16 <string n, bits<16> regIdx, bit ArtificialHigh = 1,
Expand Down Expand Up @@ -307,45 +311,51 @@ foreach Index = 0...255 in {
// Groupings using register classes and tuples
//===----------------------------------------------------------------------===//

def SCC_CLASS : RegisterClass<"AMDGPU", [i1], 1, (add SCC)> {
def SCC_CLASS : SIRegisterClass<"AMDGPU", [i1], 1, (add SCC)> {
let CopyCost = -1;
let isAllocatable = 0;
let HasSGPR = 1;
}

def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
def M0_CLASS : SIRegisterClass<"AMDGPU", [i32], 32, (add M0)> {
let CopyCost = 1;
let isAllocatable = 0;
let HasSGPR = 1;
}

def M0_CLASS_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
def M0_CLASS_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
let CopyCost = 1;
let Size = 16;
let isAllocatable = 0;
let HasSGPR = 1;
}

// TODO: Do we need to set DwarfRegAlias on register tuples?

def SGPR_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16,
def SGPR_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
(add (sequence "SGPR%u_LO16", 0, 105))> {
let AllocationPriority = 9;
let Size = 16;
let GeneratePressureSet = 0;
let HasSGPR = 1;
}

def SGPR_HI16 : RegisterClass<"AMDGPU", [i16, f16], 16,
def SGPR_HI16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
(add (sequence "SGPR%u_HI16", 0, 105))> {
let isAllocatable = 0;
let Size = 16;
let GeneratePressureSet = 0;
let HasSGPR = 1;
}

// SGPR 32-bit registers
def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add (sequence "SGPR%u", 0, 105))> {
// Give all SGPR classes higher priority than VGPR classes, because
// we want to spill SGPRs to VGPRs.
let AllocationPriority = 9;
let GeneratePressureSet = 0;
let HasSGPR = 1;
}

// SGPR 64-bit registers
Expand Down Expand Up @@ -376,16 +386,18 @@ def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s"
def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">;

// Trap handler TMP 32-bit registers
def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
def TTMP_32 : SIRegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
(add (sequence "TTMP%u", 0, 15))> {
let isAllocatable = 0;
let HasSGPR = 1;
}

// Trap handler TMP 16-bit registers
def TTMP_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16,
def TTMP_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
(add (sequence "TTMP%u_LO16", 0, 15))> {
let Size = 16;
let isAllocatable = 0;
let HasSGPR = 1;
}

// Trap handler TMP 64-bit registers
Expand Down Expand Up @@ -598,16 +610,18 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
// Register classes used as source and destination
//===----------------------------------------------------------------------===//

def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add FP_REG, SP_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
let HasSGPR = 1;
}

def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
(add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
let HasSGPR = 1;
}

def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,
Expand All @@ -616,18 +630,18 @@ def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,
let CopyCost = -1;
}

let GeneratePressureSet = 0 in {
let GeneratePressureSet = 0, HasSGPR = 1 in {
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
def SReg_32_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID,
SRC_VCCZ, SRC_EXECZ, SRC_SCC)> {
let AllocationPriority = 10;
}

def SReg_LO16_XM0_XEXEC : RegisterClass<"AMDGPU", [i16, f16], 16,
def SReg_LO16_XM0_XEXEC : SIRegisterClass<"AMDGPU", [i16, f16], 16,
(add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16,
XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, TTMP_LO16, TMA_LO_LO16,
TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO16,
Expand All @@ -637,95 +651,105 @@ def SReg_LO16_XM0_XEXEC : RegisterClass<"AMDGPU", [i16, f16], 16,
let AllocationPriority = 10;
}

def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
def SReg_32_XEXEC_HI : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
let AllocationPriority = 10;
}

def SReg_LO16_XEXEC_HI : RegisterClass<"AMDGPU", [i16, f16], 16,
def SReg_LO16_XEXEC_HI : SIRegisterClass<"AMDGPU", [i16, f16], 16,
(add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, M0_CLASS_LO16)> {
let Size = 16;
let AllocationPriority = 10;
}

def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
def SReg_32_XM0 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
let AllocationPriority = 10;
}

def SReg_LO16_XM0 : RegisterClass<"AMDGPU", [i16, f16], 16,
def SReg_LO16_XM0 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
(add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, EXEC_HI_LO16)> {
let Size = 16;
let AllocationPriority = 10;
}

def SReg_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16,
def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
(add SGPR_LO16, SReg_LO16_XM0, M0_CLASS_LO16, EXEC_LO_LO16, EXEC_HI_LO16, SReg_LO16_XEXEC_HI)> {
let Size = 16;
let AllocationPriority = 10;
}
} // End GeneratePressureSet = 0

// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
def SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
let AllocationPriority = 10;
let HasSGPR = 1;
}

let GeneratePressureSet = 0 in {
def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasSGPR = 1;
}

def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
(add SGPR_64Regs)> {
let CopyCost = 1;
let AllocationPriority = 11;
let HasSGPR = 1;
}

// CCR (call clobbered registers) SGPR 64-bit registers
def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
def CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
(add (trunc SGPR_64, 16))> {
let CopyCost = SGPR_64.CopyCost;
let AllocationPriority = SGPR_64.AllocationPriority;
let HasSGPR = 1;
}

// Call clobbered 64-bit SGPRs for AMDGPU_Gfx CC
def Gfx_CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
(add (trunc (shl SGPR_64, 15), 1), // s[30:31]
(trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[s62:63]
let CopyCost = SGPR_64.CopyCost;
let AllocationPriority = SGPR_64.AllocationPriority;
let HasSGPR = 1;
}

def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
(add TTMP_64Regs)> {
let isAllocatable = 0;
let HasSGPR = 1;
}

def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
def SReg_64_XEXEC : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
(add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
let CopyCost = 1;
let AllocationPriority = 13;
let HasSGPR = 1;
}

def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
def SReg_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
(add SReg_64_XEXEC, EXEC)> {
let CopyCost = 1;
let AllocationPriority = 13;
let HasSGPR = 1;
}

def SReg_1_XEXEC : RegisterClass<"AMDGPU", [i1], 32,
def SReg_1_XEXEC : SIRegisterClass<"AMDGPU", [i1], 32,
(add SReg_64_XEXEC, SReg_32_XM0_XEXEC)> {
let CopyCost = 1;
let isAllocatable = 0;
let HasSGPR = 1;
}

def SReg_1 : RegisterClass<"AMDGPU", [i1], 32,
def SReg_1 : SIRegisterClass<"AMDGPU", [i1], 32,
(add SReg_1_XEXEC, EXEC, EXEC_LO)> {
let CopyCost = 1;
let isAllocatable = 0;
let HasSGPR = 1;
}

multiclass SRegClass<int numRegs, int priority,
Expand All @@ -738,18 +762,18 @@ multiclass SRegClass<int numRegs, int priority,
defvar sgprName = !strconcat("SGPR_", suffix);
defvar ttmpName = !strconcat("TTMP_", suffix);

let AllocationPriority = priority, CopyCost = copyCost in {
def "" # sgprName : RegisterClass<"AMDGPU", regTypes, 32, (add regList)> {
let AllocationPriority = priority, CopyCost = copyCost, HasSGPR = 1 in {
def "" # sgprName : SIRegisterClass<"AMDGPU", regTypes, 32, (add regList)> {
}

if hasTTMP then {
def "" # ttmpName : RegisterClass<"AMDGPU", regTypes, 32, (add ttmpList)> {
def "" # ttmpName : SIRegisterClass<"AMDGPU", regTypes, 32, (add ttmpList)> {
let isAllocatable = 0;
}
}

def SReg_ # suffix :
RegisterClass<"AMDGPU", regTypes, 32,
SIRegisterClass<"AMDGPU", regTypes, 32,
!con(!dag(add, [!cast<RegisterClass>(sgprName)], ["sgpr"]),
!if(hasTTMP,
!dag(add, [!cast<RegisterClass>(ttmpName)], ["ttmp"]),
Expand Down Expand Up @@ -855,11 +879,13 @@ def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
let HasVGPR = 1;
let HasSGPR = 1;
}

def VS_64 : SIRegisterClass<"AMDGPU", [i64, f64, v2f32], 32, (add VReg_64, SReg_64)> {
let isAllocatable = 0;
let HasVGPR = 1;
let HasSGPR = 1;
}

def AV_32 : SIRegisterClass<"AMDGPU", VGPR_32.RegTypes, 32, (add VGPR_32, AGPR_32)> {
Expand Down

0 comments on commit 399b7de

Please sign in to comment.