Skip to content

Commit 82380f3

Browse files
authored
[AMDGPU] Prioritize allocation of low 256 VGPR classes (#167978)
If we have 1024 VGPRs available, we need to give priority to the allocation of registers for operands that can only use the low 256. These are notably the scale operands of V_WMMA_SCALE instructions. Otherwise large tuples will be allocated first and take all the low registers, so we would have to spill to make room for these scale registers. Allocation priority by itself does not eliminate spilling completely in large kernels, although it helps to some degree. Increasing the spill weight of a restricted class on top of it also helps.
1 parent 03f4d4d commit 82380f3

File tree

3 files changed

+14
-4
lines changed

3 files changed

+14
-4
lines changed

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,6 +496,17 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
496496

497497
SmallVector<StringLiteral>
498498
getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override;
499+
500+
float
501+
getSpillWeightScaleFactor(const TargetRegisterClass *RC) const override {
502+
// Prioritize VGPR_32_Lo256 over other classes which may occupy registers
503+
// beyond v256.
504+
return AMDGPUGenRegisterInfo::getSpillWeightScaleFactor(RC) *
505+
((RC == &AMDGPU::VGPR_32_Lo256RegClass ||
506+
RC == &AMDGPU::VReg_64_Lo256_Align2RegClass)
507+
? 2.0
508+
: 1.0);
509+
}
499510
};
500511

501512
namespace AMDGPU {

llvm/lib/Target/AMDGPU/SIRegisterInfo.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -644,7 +644,7 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1
644644
// Identical to VGPR_32 except it only contains the low 256 (Lo256) registers.
645645
def VGPR_32_Lo256 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
646646
(add (sequence "VGPR%u", 0, 255))> {
647-
let AllocationPriority = 0;
647+
let AllocationPriority = !add(3, !mul(BaseClassPriority, BaseClassScaleFactor));
648648
let GeneratePressureSet = 0;
649649
let Size = 32;
650650
let Weight = 1;

llvm/test/CodeGen/AMDGPU/regalloc-spill-wmma-scale.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s
22

3-
; FIXME: Scale operands of WMMA are limited to low 256 VGPRs
4-
; currently we are spilling it because all low VGPRs are occupied even though our budget is higher.
3+
; Scale operands of WMMA are limited to low 256 VGPRs
54
; Make sure we do not spill scale operands because of the low 256 restriction.
6-
; CHECK: ; ScratchSize: 12
5+
; CHECK: ; ScratchSize: 0
76
; CHECK: ; Occupancy: 1
87

98
define amdgpu_kernel void @spill_scale_test(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, float %arg5, float %arg6, float %arg7, <16 x i32> %arg8, float %arg9, <16 x i32> %arg10, float %arg11, <16 x i8> %arg12) #0 {

0 commit comments

Comments
 (0)