[AMDGPU] Attempt to reschedule without clustering
We want more load/store clustering, but we also want to maintain
low register pressure; these are opposing goals. Allow the
scheduler to reschedule regions without mutations applied if we
hit a register limit.

Differential Revision: https://reviews.llvm.org/D73386
rampitec committed Jan 27, 2020
1 parent 9771122 commit 53eb0f8
Showing 3 changed files with 99 additions and 18 deletions.
69 changes: 51 additions & 18 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -316,13 +316,13 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
StartingOccupancy(MFI.getOccupancy()),
MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {

LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}

void GCNScheduleDAGMILive::schedule() {
if (Stage == 0) {
if (Stage == Collect) {
// Just record regions at the first pass.
Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
return;
@@ -348,6 +348,7 @@ void GCNScheduleDAGMILive::schedule() {

ScheduleDAGMILive::schedule();
Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
RescheduleRegions[RegionIdx] = false;

if (!LIS)
return;
@@ -389,20 +390,28 @@ void GCNScheduleDAGMILive::schedule() {
<< MinOccupancy << ".\n");
}

unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
if (PressureAfter.getVGPRNum() > MaxVGPRs ||
PressureAfter.getSGPRNum() > MaxSGPRs)
RescheduleRegions[RegionIdx] = true;

if (WavesAfter >= MinOccupancy) {
unsigned TotalVGPRs = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
unsigned TotalSGPRs = AMDGPU::IsaInfo::getAddressableNumSGPRs(&ST);
if (WavesAfter > MFI.getMinWavesPerEU() ||
if (Stage == UnclusteredReschedule &&
!PressureAfter.less(ST, PressureBefore)) {
LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
} else if (WavesAfter > MFI.getMinWavesPerEU() ||
PressureAfter.less(ST, PressureBefore) ||
(TotalVGPRs >= PressureAfter.getVGPRNum() &&
TotalSGPRs >= PressureAfter.getSGPRNum())) {
!RescheduleRegions[RegionIdx]) {
Pressure[RegionIdx] = PressureAfter;
return;
} else {
LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
}
LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
}

LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
RescheduleRegions[RegionIdx] = true;
RegionEnd = RegionBegin;
for (MachineInstr *MI : Unsched) {
if (MI->isDebugInstr())
@@ -532,41 +541,63 @@ void GCNScheduleDAGMILive::finalizeSchedule() {

LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
RescheduleRegions.resize(Regions.size());
RescheduleRegions.set();

if (!Regions.empty())
BBLiveInMap = getBBLiveInMap();

std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;

do {
Stage++;
RegionIdx = 0;
MachineBasicBlock *MBB = nullptr;

if (Stage > 1) {
if (Stage > InitialSchedule) {
if (!LIS)
break;

// Retry function scheduling if we found resulting occupancy and it is
// lower than used for first pass scheduling. This will give more freedom
// to schedule low register pressure blocks.
// Code is partially copied from MachineSchedulerBase::scheduleRegions().

if (!LIS || StartingOccupancy <= MinOccupancy)
break;
if (Stage == UnclusteredReschedule) {
if (RescheduleRegions.none())
continue;
LLVM_DEBUG(dbgs() <<
"Retrying function scheduling without clustering.\n");
}

if (Stage == ClusteredLowOccupancyReschedule) {
if (StartingOccupancy <= MinOccupancy)
break;

LLVM_DEBUG(
dbgs()
<< "Retrying function scheduling with lowest recorded occupancy "
<< MinOccupancy << ".\n");
LLVM_DEBUG(
dbgs()
<< "Retrying function scheduling with lowest recorded occupancy "
<< MinOccupancy << ".\n");

S.setTargetOccupancy(MinOccupancy);
S.setTargetOccupancy(MinOccupancy);
}
}

if (Stage == UnclusteredReschedule)
SavedMutations.swap(Mutations);

for (auto Region : Regions) {
if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx])
continue;

RegionBegin = Region.first;
RegionEnd = Region.second;

if (RegionBegin->getParent() != MBB) {
if (MBB) finishBlock();
MBB = RegionBegin->getParent();
startBlock(MBB);
if (Stage == 1)
if (Stage == InitialSchedule)
computeBlockPressure(MBB);
}

@@ -594,5 +625,7 @@
}
finishBlock();

} while (Stage < 2);
if (Stage == UnclusteredReschedule)
SavedMutations.swap(Mutations);
} while (Stage != LastStage);
}
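
For readers outside the LLVM scheduler, the staged loop in finalizeSchedule() above boils down to a simple pattern: walk the stages in order and, for the unclustered stage only, swap the clustering DAG mutations out, revisit just the flagged regions, and swap the mutations back. Below is a minimal, self-contained C++ sketch of that pattern; the stand-ins for mutations and regions are hypothetical, and the diff above is the authoritative code.

// sketch.cpp -- hypothetical, simplified model of the staged rescheduling
// loop in GCNScheduleDAGMILive::finalizeSchedule(). Not LLVM code.
#include <cstdio>
#include <memory>
#include <vector>

enum : unsigned {
  Collect,                          // first pass: just record regions
  InitialSchedule,                  // normal pass, clustering enabled
  UnclusteredReschedule,            // retry flagged regions, no clustering
  ClusteredLowOccupancyReschedule,  // retry at the lowest found occupancy
  LastStage = ClusteredLowOccupancyReschedule
};

struct Mutation {};  // stands in for ScheduleDAGMutation (e.g. clustering)

int main() {
  std::vector<std::unique_ptr<Mutation>> Mutations;
  Mutations.push_back(std::make_unique<Mutation>());  // load/store clustering

  // One bit per region; set means "retry this region without clustering".
  std::vector<bool> RescheduleRegions = {false, true, false};
  std::vector<std::unique_ptr<Mutation>> SavedMutations;

  unsigned Stage = Collect;
  do {
    ++Stage;

    // The clustering mutations are swapped out for the unclustered pass only.
    if (Stage == UnclusteredReschedule)
      SavedMutations.swap(Mutations);

    for (unsigned Idx = 0; Idx < RescheduleRegions.size(); ++Idx) {
      // During the unclustered pass, only regions that hit the register
      // limit (or had their schedule reverted) are revisited.
      if (Stage == UnclusteredReschedule && !RescheduleRegions[Idx])
        continue;
      std::printf("stage %u: schedule region %u with %zu mutation(s)\n",
                  Stage, Idx, Mutations.size());
    }

    // ...and swapped back in before the low-occupancy pass.
    if (Stage == UnclusteredReschedule)
      SavedMutations.swap(Mutations);
  } while (Stage != LastStage);
  return 0;
}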
12 changes: 12 additions & 0 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -64,6 +64,14 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {

class GCNScheduleDAGMILive final : public ScheduleDAGMILive {

enum : unsigned {
Collect,
InitialSchedule,
UnclusteredReschedule,
ClusteredLowOccupancyReschedule,
LastStage = ClusteredLowOccupancyReschedule
};

const GCNSubtarget &ST;

SIMachineFunctionInfo &MFI;
@@ -84,6 +92,10 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
SmallVector<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>, 32> Regions;

// Records if a region is not yet scheduled, or schedule has been reverted,
// or we generally desire to reschedule it.
BitVector RescheduleRegions;

// Region live-in cache.
SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;

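
The RescheduleRegions bit vector declared above is what ties the two files together: schedule() clears a region's bit once it has been scheduled and sets it again when the region's pressure exceeds the register budget or its schedule is reverted, and finalizeSchedule() then revisits exactly those regions. A compile-only sketch of the flagging check follows, with a hypothetical stand-in for GCNRegPressure and the subtarget queries.

// Hypothetical stand-in for GCNRegPressure; only the shape of the check
// mirrors the real code in GCNScheduleDAGMILive::schedule() above.
struct RegPressure {
  unsigned VGPRNum;
  unsigned SGPRNum;
};

// Returns true when post-scheduling pressure exceeds the register budget,
// i.e. when the region should be flagged in RescheduleRegions.
bool exceedsRegisterLimit(const RegPressure &PressureAfter,
                          unsigned MaxVGPRs, unsigned MaxSGPRs) {
  return PressureAfter.VGPRNum > MaxVGPRs ||
         PressureAfter.SGPRNum > MaxSGPRs;
}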
36 changes: 36 additions & 0 deletions llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll
@@ -0,0 +1,36 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Interleave loads and stores to fit into 9 VGPR limit.
; This requires to avoid load/store clustering.

; GCN: global_load_dwordx4
; GCN: global_store_dwordx4
; GCN: global_load_dwordx4
; GCN: global_store_dwordx4
; GCN: global_load_dwordx4
; GCN: global_store_dwordx4
; GCN: NumVgprs: {{[0-9]$}}
; GCN: ScratchSize: 0{{$}}

define amdgpu_kernel void @load_store_max_9vgprs(<4 x i32> addrspace(1)* nocapture noalias readonly %arg, <4 x i32> addrspace(1)* nocapture noalias %arg1) #1 {
bb:
%id = call i32 @llvm.amdgcn.workitem.id.x()
%base = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i32 %id
%tmp = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 1
%tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp, align 4
%tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 3
%tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
%tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %base, i32 5
%tmp6 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp5, align 4
store <4 x i32> %tmp2, <4 x i32> addrspace(1)* %arg1, align 4
%tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 3
store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp7, align 4
%tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 5
store <4 x i32> %tmp6, <4 x i32> addrspace(1)* %tmp8, align 4
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #0

attributes #0 = { nounwind readnone }
attributes #1 = { "amdgpu-num-vgpr"="9" }
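
To reproduce this locally, an individual codegen test like the one above is normally driven through llvm-lit from an LLVM build tree; a typical invocation looks something like the line below (exact paths depend on your checkout and build layout):

./bin/llvm-lit -v llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll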
