[AMDGPU] Define 16 bit VGPR subregs
We have loads that preserve the low or high 16 bits of their
destinations. However, we always use a whole 32 bit register
for these. The same happens with 16 bit stores: we have to
use a full 32 bit register, so if the high bits are clobbered
the register needs to be copied. One example of such code is
added to load-hi16.ll.

The proper solution to the problem is to define 16 bit subregs
and use them in operations which either do not read the other
half of a VGPR or preserve it when the VGPR is written.

This patch simply defines subregisters and register classes.
At the moment there should be no difference in code generation.
A lot more work is needed to actually use these new register
classes. Therefore, there are no new tests at this time.

Register weight calculation has changed with the new subregs,
so appropriate adjustments were made to keep all calculations,
especially register pressure, just as they were.

Differential Revision: https://reviews.llvm.org/D74873
rampitec committed Mar 31, 2020
1 parent 28518d9 commit 08682dc
Showing 10 changed files with 189 additions and 62 deletions.
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -11,7 +11,7 @@ def SGPRRegBank : RegisterBank<"SGPR",
>;

def VGPRRegBank : RegisterBank<"VGPR",
[VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512, VReg_1024]
[VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512, VReg_1024]
>;

// It is helpful to distinguish conditions from ordinary SGPRs.
20 changes: 18 additions & 2 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -40,7 +40,20 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(

SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
: AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {}
SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {

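// Each 32-bit subN index must occupy two adjacent lane-mask bits (sub0 =
// 0b11 up to sub31 in the top pair), and lo16/hi16 together must cover
// exactly sub0; getNumCoveredRegs() in SIRegisterInfo.h relies on this layout.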
assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
(getSubRegIndexLaneMask(AMDGPU::lo16) |
getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
"getNumCoveredRegs() will not work with generated subreg masks!");

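// Each 32-bit VGPR is now made of two register units (its lo16 and hi16
// halves). Ignore the hi16 units when computing pressure so that pressure
// is still counted in whole VGPRs, exactly as before this change.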
RegPressureIgnoredUnits.resize(getNumRegUnits());
RegPressureIgnoredUnits.set(*MCRegUnitIterator(AMDGPU::M0, this));
for (auto Reg : AMDGPU::VGPR_HI16RegClass)
RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));
}

void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
unsigned Reg) const {
@@ -1777,6 +1790,8 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
default:
return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
case AMDGPU::VGPR_32RegClassID:
case AMDGPU::VGPR_LO16RegClassID:
case AMDGPU::VGPR_HI16RegClassID:
return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
case AMDGPU::SGPR_32RegClassID:
return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
@@ -1800,8 +1815,9 @@ unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
static const int Empty[] = { -1 };

if (hasRegUnit(AMDGPU::M0, RegUnit))
if (RegPressureIgnoredUnits[RegUnit])
return Empty;

return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}

9 changes: 8 additions & 1 deletion llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -32,6 +32,7 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
const GCNSubtarget &ST;
bool SpillSGPRToVGPR;
bool isWave32;
BitVector RegPressureIgnoredUnits;

void reserveRegisterTuples(BitVector &, unsigned Reg) const;

@@ -269,7 +270,13 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {

// \returns number of 32 bit registers covered by a \p LM
static unsigned getNumCoveredRegs(LaneBitmask LM) {
return LM.getNumLanes();
// The assumption is that every lo16 subreg is an even bit and every hi16
// is an adjacent odd bit or vice versa.
uint64_t Mask = LM.getAsInteger();
uint64_t Even = Mask & 0xAAAAAAAAAAAAAAAAULL;
Mask = (Even >> 1) | Mask;
uint64_t Odd = Mask & 0x5555555555555555ULL;
return countPopulation(Odd);
}

// \returns the DWORD offset of \p SubReg
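To make the bit trick above concrete, here is a minimal standalone sketch of the same folding (hypothetical helper name; C++20 std::popcount stands in for LLVM's countPopulation):

#include <bit>
#include <cassert>
#include <cstdint>

// lo16 lanes occupy even mask bits and hi16 lanes the adjacent odd bits, so
// any combination of a register's two lane bits must count as one register.
static unsigned numCoveredRegs(uint64_t Mask) {
  uint64_t Even = Mask & 0xAAAAAAAAAAAAAAAAULL; // lanes at odd bit positions
  Mask = (Even >> 1) | Mask;                    // fold each pair onto its even bit
  uint64_t Odd = Mask & 0x5555555555555555ULL;  // one bit left per 32-bit register
  return static_cast<unsigned>(std::popcount(Odd));
}

int main() {
  assert(numCoveredRegs(0b0011) == 1); // sub0: both halves of one VGPR
  assert(numCoveredRegs(0b0001) == 1); // sub0_lo16 alone still covers one VGPR
  assert(numCoveredRegs(0b0010) == 1); // sub0_hi16 alone, likewise
  assert(numCoveredRegs(0b1111) == 2); // sub0 and sub1: two VGPRs
  return 0;
}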
90 changes: 87 additions & 3 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -10,10 +10,44 @@
// Subregister declarations
//===----------------------------------------------------------------------===//

class Indexes<int N> {
list<int> all = [0, 1, 2, 3, 4, 5, 6 , 7,
8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23,
24, 25, 26, 27, 28, 29, 30, 31];

// Returns list of indexes [0..N)
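// e.g. Indexes<3>.slice is [0, 1, 2]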
list<int> slice =
!foldl([]<int>, all, acc, cur,
!listconcat(acc, !if(!lt(cur, N), [cur], [])));
}

let Namespace = "AMDGPU" in {

def lo16 : SubRegIndex<16, 0>;
def hi16 : SubRegIndex<16, 16>;

foreach Index = 0-31 in {
def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
}

foreach Index = 1-31 in {
def sub#Index#_lo16 : ComposedSubRegIndex<!cast<SubRegIndex>(sub#Index), lo16>;
def sub#Index#_hi16 : ComposedSubRegIndex<!cast<SubRegIndex>(sub#Index), hi16>;
}

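// Generate the composite indices for register tuples: for each Size in
// dwords (the 64..160 bit, 256 bit and 512 bit tuples) and each start Index,
// e.g. Size = 2, Index = 0 yields sub0_sub1 : SubRegIndex<64, 0> with
// CoveringSubRegIndices = [sub0, sub1].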
foreach Size = {2-5,8,16} in {
foreach Index = Indexes<!add(33, !mul(Size, -1))>.slice in {
def !foldl("", Indexes<Size>.slice, acc, cur,
!strconcat(acc#!if(!eq(acc,""),"","_"), "sub"#!add(cur, Index))) :
SubRegIndex<!mul(Size, 32), !shl(Index, 5)> {
let CoveringSubRegIndices =
!foldl([]<SubRegIndex>, Indexes<Size>.slice, acc, cur,
!listconcat(acc, [!cast<SubRegIndex>(sub#!add(cur, Index))]));
}
}
}

}

//===----------------------------------------------------------------------===//
@@ -92,6 +126,16 @@ class SIReg <string n, bits<16> regIdx = 0> :
let HWEncoding = regIdx;
}

class SIRegWithSubRegs <string n, list<Register> subregs, bits<16> regIdx = 0> :
RegisterWithSubRegs<n, subregs> {
let Namespace = "AMDGPU";

// This is not yet the complete register encoding. An additional
// bit is set for VGPRs.
let HWEncoding = regIdx;
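// The register is fully covered by its lo16/hi16 halves, so defining both
// halves defines the whole register.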
let CoveredBySubRegs = 1;
}

// Special Registers
def VCC_LO : SIReg<"vcc_lo", 106>;
def VCC_HI : SIReg<"vcc_hi", 107>;
@@ -221,10 +265,29 @@ foreach Index = 0-255 in {
// The ratio index/allocation_granularity is taken as the cost value.
// The allocation granularity is taken to be 4 here.
let CostPerUse=!if(!gt(Index, 31), !srl(Index, 2), 0) in {
def VGPR#Index :
SIReg <"v"#Index, Index>,
DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]> {

// There is no special encoding for the low 16 bit subreg: this is not a real
// register but rather an operand for instructions preserving the high 16 bits
// of the result or reading just the low 16 bits of a 32 bit VGPR.
// It is encoded as the corresponding 32 bit register.
def VGPR#Index#_LO16 : SIReg <"v"#Index#".l", Index>,
DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]> {
let HWEncoding{8} = 1;
}
// There is no special encoding for the high 16 bit subreg: this is not a real
// register but rather an operand for instructions preserving the low 16 bits
// of the result or reading just the high 16 bits of a 32 bit VGPR.
// It is encoded as the corresponding 32 bit register.
def VGPR#Index#_HI16 : SIReg <"v"#Index#".h", Index>,
DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]> {
let HWEncoding{8} = 1;
}
def VGPR#Index : SIRegWithSubRegs <"v"#Index,
[!cast<Register>("VGPR"#Index#"_LO16"), !cast<Register>("VGPR"#Index#"_HI16")],
Index>,
DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]> {
let HWEncoding{8} = 1;
let SubRegIndices = [lo16, hi16];
}
}
}
@@ -386,13 +449,27 @@ class RegisterTypes<list<ValueType> reg_types> {
def Reg16Types : RegisterTypes<[i16, f16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;

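// The 16-bit halves of the VGPRs. GeneratePressureSet = 0 suppresses the
// TableGen-generated pressure sets for these classes; pressure is tracked
// in whole 32-bit VGPRs instead (see RegPressureIgnoredUnits in
// SIRegisterInfo.cpp).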
def VGPR_LO16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (sequence "VGPR%u_LO16", 0, 255))> {
let AllocationPriority = 1;
let Size = 16;
let GeneratePressureSet = 0;
}

def VGPR_HI16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
(add (sequence "VGPR%u_HI16", 0, 255))> {
let AllocationPriority = 1;
let Size = 16;
let GeneratePressureSet = 0;
}

// VGPR 32-bit registers
// i16/f16 only on VI+
def VGPR_32 : RegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
(add (sequence "VGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
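// The 16-bit subregs change the default (unit-based) register weights; set
// Weight explicitly to the class's size in 32-bit registers so weight
// calculations stay just as they were before the new subregs.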
let Weight = 1;
}

// VGPR 64-bit registers
@@ -634,6 +711,7 @@ def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16, p0,
// Requires 2 v_mov_b32 to copy
let CopyCost = 2;
let AllocationPriority = 2;
let Weight = 2;
}

@@ -642,6 +720,7 @@ def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96)> {
// Requires 3 v_mov_b32 to copy
let CopyCost = 3;
let AllocationPriority = 3;
let Weight = 3;
}

@@ -651,6 +730,7 @@ def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64, i128], 32,
// Requires 4 v_mov_b32 to copy
let CopyCost = 4;
let AllocationPriority = 4;
let Weight = 4;
}

@@ -660,27 +740,31 @@ def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
// Requires 5 v_mov_b32 to copy
let CopyCost = 5;
let AllocationPriority = 5;
let Weight = 5;
}

def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
(add VGPR_256)> {
let Size = 256;
let CopyCost = 8;
let AllocationPriority = 6;
let Weight = 8;
}

def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
(add VGPR_512)> {
let Size = 512;
let CopyCost = 16;
let AllocationPriority = 7;
let Weight = 16;
}

def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
(add VGPR_1024)> {
let Size = 1024;
let CopyCost = 32;
let AllocationPriority = 8;
let Weight = 32;
}

def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
