From d9790e0e745c0c1e4be9c4b93e042b7af008d85c Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 13 Nov 2025 15:24:52 -0800 Subject: [PATCH] [AMDGPU] Prioritize allocation of low 256 VGPR classes If we have 1024 VGPRs available, we need to give allocation priority to the register classes used by operands that can only use the low 256 registers. This notably applies to the scale operands of V_WMMA_SCALE instructions. Otherwise large tuples will be allocated first and take all the low registers, so we would have to spill to make room for these scale registers. Allocation priority by itself does not completely eliminate spilling in large kernels, although it helps to some degree. Increasing the spill weight of a restricted class on top of that also helps. --- llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 11 +++++++++++ llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index a6af25dfd7d6f..28ab2137b193c 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -501,6 +501,17 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { SmallVector getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override; + + float + getSpillWeightScaleFactor(const TargetRegisterClass *RC) const override { + // Prioritize VGPR_32_Lo256 over other classes which may occupy registers + // beyond v256. + return AMDGPUGenRegisterInfo::getSpillWeightScaleFactor(RC) * + ((RC == &AMDGPU::VGPR_32_Lo256RegClass || + RC == &AMDGPU::VReg_64_Lo256_Align2RegClass) + ? 
2.0 + : 1.0); + } }; namespace AMDGPU { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index abe12c17ae76c..5cff5f2248b02 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -644,7 +644,7 @@ def VGPR_32_Lo128 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg1 // Identical to VGPR_32 except it only contains the low 256 (Lo256) registers. def VGPR_32_Lo256 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32, (add (sequence "VGPR%u", 0, 255))> { - let AllocationPriority = 0; + let AllocationPriority = !add(3, !mul(BaseClassPriority, BaseClassScaleFactor)); let GeneratePressureSet = 0; let Size = 32; let Weight = 1;