[AMDGPU] Verify dominance when rewriting spills to registers #167347
✅ With the latest revision this PR passed the C/C++ code formatter.
I'm still working on creating a smaller testcase. It's hard to reproduce.
@llvm/pr-subscribers-backend-amdgpu

Author: Austin Kerbow (kerbowa)

Changes

When performing spill elimination in the AGPR copy rewrite pass, it was possible to see spill reloads that were not dominated by any store. This caused invalid MIR to be generated, where vreg uses were not dominated by defs. This patch adds a dominance check before rewriting spills.

Patch is 172.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167347.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
index 89c16dadb4b41..d2c56589acafd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/VirtRegMap.h"
@@ -58,6 +59,7 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
LiveIntervals &LIS;
LiveStacks &LSS;
const RegisterClassInfo &RegClassInfo;
+ MachineDominatorTree &MDT;
bool attemptReassignmentsToAGPR(SmallSetVector<Register, 4> &InterferingRegs,
MCPhysReg PrefPhysReg) const;
@@ -66,10 +68,11 @@ class AMDGPURewriteAGPRCopyMFMAImpl {
AMDGPURewriteAGPRCopyMFMAImpl(MachineFunction &MF, VirtRegMap &VRM,
LiveRegMatrix &LRM, LiveIntervals &LIS,
LiveStacks &LSS,
- const RegisterClassInfo &RegClassInfo)
+ const RegisterClassInfo &RegClassInfo,
+ MachineDominatorTree &MDT)
: MF(MF), ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
TRI(*ST.getRegisterInfo()), MRI(MF.getRegInfo()), VRM(VRM), LRM(LRM),
- LIS(LIS), LSS(LSS), RegClassInfo(RegClassInfo) {}
+ LIS(LIS), LSS(LSS), RegClassInfo(RegClassInfo), MDT(MDT) {}
bool isRewriteCandidate(const MachineInstr &MI) const {
return TII.isMAI(MI) && AMDGPU::getMFMASrcCVDstAGPROp(MI.getOpcode()) != -1;
@@ -515,6 +518,56 @@ void AMDGPURewriteAGPRCopyMFMAImpl::eliminateSpillsOfReassignedVGPRs() const {
if (SpillReferences == SpillSlotReferences.end())
continue;
+ // Verify that each spill restore is dominated by at least one spill save.
+ SmallVector<MachineInstr *, 4> Stores, Loads;
+ Stores.reserve(SpillReferences->second.size());
+ Loads.reserve(SpillReferences->second.size());
+ for (MachineInstr *MI : SpillReferences->second) {
+ if (MI->mayStore())
+ Stores.push_back(MI);
+ else if (MI->mayLoad())
+ Loads.push_back(MI);
+ }
+
+ SmallVector<MachineInstr *, 4> ReachableStores;
+ ReachableStores.reserve(Stores.size());
+ for (MachineInstr *S : Stores)
+ if (MDT.isReachableFromEntry(S->getParent()))
+ ReachableStores.push_back(S);
+
+ if (ReachableStores.empty()) {
+ LLVM_DEBUG(dbgs() << "Skipping " << printReg(Slot, &TRI)
+ << ": no reachable stores\n");
+ continue;
+ }
+
+ auto IsDominatedByAnyStore = [&](MachineInstr *LoadMI) -> bool {
+ MachineBasicBlock *LoadMBB = LoadMI->getParent();
+ if (!MDT.isReachableFromEntry(LoadMBB))
+ return true;
+ for (MachineInstr *StoreMI : ReachableStores) {
+ MachineBasicBlock *StoreMBB = StoreMI->getParent();
+ if (StoreMBB == LoadMBB) {
+ for (MachineBasicBlock::iterator I = StoreMI->getIterator(),
+ E = StoreMBB->end();
+ I != E; ++I)
+ if (&*I == LoadMI)
+ return true;
+ continue;
+ }
+ if (MDT.dominates(StoreMBB, LoadMBB))
+ return true;
+ }
+ return false;
+ };
+
+ if (!llvm::all_of(Loads, IsDominatedByAnyStore)) {
+ LLVM_DEBUG(
+ dbgs() << "Skipping " << printReg(Slot, &TRI)
+ << ": some reachable load not dominated by any store\n");
+ continue;
+ }
+
const TargetRegisterClass *RC = LSS.getIntervalRegClass(Slot);
LLVM_DEBUG(dbgs() << "Trying to eliminate " << printReg(Slot, &TRI)
@@ -603,11 +656,13 @@ class AMDGPURewriteAGPRCopyMFMALegacy : public MachineFunctionPass {
AU.addRequired<VirtRegMapWrapperLegacy>();
AU.addRequired<LiveRegMatrixWrapperLegacy>();
AU.addRequired<LiveStacksWrapperLegacy>();
+ AU.addRequired<MachineDominatorTreeWrapperPass>();
AU.addPreserved<LiveIntervalsWrapperPass>();
AU.addPreserved<VirtRegMapWrapperLegacy>();
AU.addPreserved<LiveRegMatrixWrapperLegacy>();
AU.addPreserved<LiveStacksWrapperLegacy>();
+ AU.addPreserved<MachineDominatorTreeWrapperPass>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -622,6 +677,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy)
INITIALIZE_PASS_DEPENDENCY(LiveStacksWrapperLegacy)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_END(AMDGPURewriteAGPRCopyMFMALegacy, DEBUG_TYPE,
"AMDGPU Rewrite AGPR-Copy-MFMA", false, false)
@@ -641,7 +697,8 @@ bool AMDGPURewriteAGPRCopyMFMALegacy::runOnMachineFunction(
auto &LRM = getAnalysis<LiveRegMatrixWrapperLegacy>().getLRM();
auto &LIS = getAnalysis<LiveIntervalsWrapperPass>().getLIS();
auto &LSS = getAnalysis<LiveStacksWrapperLegacy>().getLS();
- AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo);
+ auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+ AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo, MDT);
return Impl.run(MF);
}
@@ -652,10 +709,11 @@ AMDGPURewriteAGPRCopyMFMAPass::run(MachineFunction &MF,
LiveRegMatrix &LRM = MFAM.getResult<LiveRegMatrixAnalysis>(MF);
LiveIntervals &LIS = MFAM.getResult<LiveIntervalsAnalysis>(MF);
LiveStacks &LSS = MFAM.getResult<LiveStacksAnalysis>(MF);
+ MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
RegisterClassInfo RegClassInfo;
RegClassInfo.runOnMachineFunction(MF);
- AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo);
+ AMDGPURewriteAGPRCopyMFMAImpl Impl(MF, VRM, LRM, LIS, LSS, RegClassInfo, MDT);
if (!Impl.run(MF))
return PreservedAnalyses::all();
auto PA = getMachineFunctionPassPreservedAnalyses();
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-agpr-spill-multi-store-crash.ll b/llvm/test/CodeGen/AMDGPU/rewrite-agpr-spill-multi-store-crash.ll
new file mode 100644
index 0000000000000..070032e03624a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-agpr-spill-multi-store-crash.ll
@@ -0,0 +1,2084 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -O3 -o - %s | FileCheck %s
+
+; This is a reduced testcase that demonstrates a bug fix in AMDGPURewriteAGPRCopyMFMA's
+; eliminateSpillsOfReassignedVGPRs(). When a spill slot has multiple stores in
+; different blocks that don't dominate all loads, the pass must skip elimination
+; to avoid creating invalid SSA by replacing all references with a single vreg.
+
+; CHECK-LABEL: kernel_crash:
+
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+@global_smem = external addrspace(3) global [0 x i8]
+
+define amdgpu_kernel void @kernel_crash(i32 inreg %0, ptr addrspace(1) inreg readonly captures(none) %1, i32 inreg %2, i32 %3, i32 %4, i32 %5, i32 %6, ptr addrspace(3) %7, ptr addrspace(3) %global_smem, i1 %8, i32 %9, i32 %10, i32 %11, i32 %12, <8 x half> %13, <8 x half> %14, <8 x half> %15, <8 x half> %16, <8 x half> %17, <8 x half> %18, <8 x half> %19, <8 x half> %20, <8 x half> %21, <8 x half> %22, <8 x half> %23, <8 x half> %24, <8 x half> %25, <8 x half> %26, float %27, ptr addrspace(3) %28, ptr addrspace(3) %29, ptr addrspace(3) %30, ptr addrspace(3) %31, ptr addrspace(3) %32, ptr addrspace(3) %33, ptr addrspace(3) %34, ptr addrspace(3) %35, ptr addrspace(3) %36, ptr addrspace(3) %37, ptr addrspace(3) %38, i1 %exitcond.not, <2 x float> %39, <2 x float> %40, ptr addrspace(3) %41, ptr addrspace(3) %42, ptr addrspace(3) %43, ptr addrspace(3) %44, ptr addrspace(3) %45, ptr addrspace(3) %46, ptr addrspace(3) %47, ptr addrspace(3) %48, ptr addrspace(3) %49, ptr addrspace(3) %50, ptr addrspace(3) %51, ptr addrspace(3) %52, ptr addrspace(3) %53, ptr addrspace(3) %54, ptr addrspace(3) %55, ptr addrspace(3) %56, ptr addrspace(3) %57, ptr addrspace(3) %58, ptr addrspace(3) %59, ptr addrspace(3) %60, i1 %61, <8 x half> %62, <8 x half> %63, <4 x float> %64, <4 x float> %65, <2 x float> %66, <2 x float> %67, i32 %68, <2 x float> %69, <2 x float> %70, <2 x float> %71, <2 x float> %72, <2 x float> %73, <2 x float> %74) #0 {
+ %76 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %77 = and i32 %76, 127
+ %78 = srem i32 %77, %3
+ %79 = or disjoint i32 %77, 1
+ %80 = srem i32 %77, %2
+ %81 = srem i32 %79, %2
+ %82 = add i32 %80, %4
+ %83 = add i32 %80, %5
+ %84 = add i32 %81, %5
+ %85 = add i32 %81, %6
+ %86 = shl i32 %0, 0
+ %87 = shl nuw nsw i32 %77, 1
+ %88 = getelementptr inbounds nuw i8, ptr addrspace(3) %7, i32 %87
+ store i16 0, ptr addrspace(3) %88, align 2
+ %89 = getelementptr inbounds nuw i8, ptr addrspace(3) %global_smem, i32 %87
+ %90 = xor i32 %87, 1040
+ %91 = getelementptr inbounds nuw i8, ptr addrspace(3) %global_smem, i32 %90
+ %92 = getelementptr inbounds nuw i8, ptr addrspace(3) %91, i32 33024
+ store i16 0, ptr addrspace(3) %92, align 2
+ %93 = getelementptr inbounds nuw i8, ptr addrspace(3) %91, i32 49664
+ %94 = getelementptr inbounds nuw i8, ptr addrspace(3) %91, i32 49920
+ %95 = xor i32 %87, 13520
+ %96 = getelementptr inbounds nuw i8, ptr addrspace(3) %global_smem, i32 %95
+ %97 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 16384
+ store i16 0, ptr addrspace(3) %97, align 2
+ %98 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 16896
+ %99 = getelementptr inbounds nuw i8, ptr addrspace(3) %96, i32 32768
+ br i1 %8, label %.lr.ph, label %.._crit_edge_crit_edge
+
+.._crit_edge_crit_edge: ; preds = %75
+ %.pre = and i32 %76, 1
+ br label %._crit_edge
+
+.lr.ph: ; preds = %75
+ %100 = and i32 %76, 152
+ %101 = xor i32 %100, 0
+ %102 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %9
+ %103 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 4160
+ %104 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 32768
+ %105 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 36928
+ %106 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 64
+ %107 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 4096
+ %108 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 256
+ %109 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 4416
+ %110 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 33024
+ %111 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 37184
+ %112 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 320
+ %113 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 4352
+ %114 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 33088
+ %115 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 37120
+ %116 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 512
+ %117 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 4672
+ %118 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 33280
+ %119 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 37440
+ %120 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 576
+ %121 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 4608
+ %122 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 33344
+ %123 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 37376
+ %124 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 768
+ %125 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 4928
+ %126 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 33536
+ %127 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 37696
+ %128 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 832
+ %129 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 4864
+ %130 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 33600
+ %131 = getelementptr inbounds nuw i8, ptr addrspace(3) %102, i32 37632
+ %132 = getelementptr inbounds nuw i8, ptr addrspace(3) @global_smem, i32 %10
+ %133 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 32768
+ %134 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 36928
+ %135 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 64
+ %136 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 4096
+ %137 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 32832
+ %138 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 36864
+ %139 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 33088
+ %140 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 37120
+ %141 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 512
+ %142 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 37440
+ %143 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 576
+ %144 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 4608
+ %145 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 33344
+ %146 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 37376
+ %147 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 768
+ %148 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 4928
+ %149 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 33536
+ %150 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 37696
+ %151 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 832
+ %152 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 4864
+ %153 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 33600
+ %154 = getelementptr inbounds nuw i8, ptr addrspace(3) %132, i32 37632
+ %155 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %9
+ %156 = getelementptr inbounds nuw i8, ptr addrspace(3) %155, i32 1088
+ %157 = getelementptr inbounds nuw i8, ptr addrspace(3) %155, i32 8192
+ %158 = getelementptr inbounds nuw i8, ptr addrspace(3) %155, i32 9280
+ %159 = getelementptr inbounds nuw i8, ptr addrspace(3) %155, i32 64
+ %160 = getelementptr inbounds nuw i8, ptr addrspace(3) %155, i32 1024
+ %161 = getelementptr inbounds nuw i8, ptr addrspace(3) %155, i32 8256
+ %162 = getelementptr inbounds nuw i8, ptr addrspace(3) %155, i32 9216
+ %163 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %11
+ %164 = getelementptr inbounds nuw i8, ptr addrspace(3) %163, i32 1088
+ %165 = getelementptr inbounds nuw i8, ptr addrspace(3) %163, i32 8192
+ %166 = getelementptr inbounds nuw i8, ptr addrspace(3) %163, i32 9280
+ %167 = getelementptr inbounds nuw i8, ptr addrspace(3) %163, i32 64
+ %168 = getelementptr inbounds nuw i8, ptr addrspace(3) %163, i32 1024
+ %169 = getelementptr inbounds nuw i8, ptr addrspace(3) %163, i32 8256
+ %170 = getelementptr inbounds nuw i8, ptr addrspace(3) %163, i32 9216
+ %171 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %10
+ %172 = getelementptr inbounds nuw i8, ptr addrspace(3) %171, i32 1088
+ %173 = getelementptr inbounds nuw i8, ptr addrspace(3) %171, i32 8192
+ %174 = getelementptr inbounds nuw i8, ptr addrspace(3) %171, i32 9280
+ %175 = getelementptr inbounds nuw i8, ptr addrspace(3) %171, i32 64
+ %176 = getelementptr inbounds nuw i8, ptr addrspace(3) %171, i32 1024
+ %177 = getelementptr inbounds nuw i8, ptr addrspace(3) %171, i32 8256
+ %178 = getelementptr inbounds nuw i8, ptr addrspace(3) %171, i32 9216
+ %179 = getelementptr inbounds nuw i8, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @global_smem, i32 65536), i32 %101
+ %180 = getelementptr inbounds nuw i8, ptr addrspace(3) %179, i32 1088
+ %181 = getelementptr inbounds nuw i8, ptr addrspace(3) %179, i32 8192
+ %182 = getelementptr inbounds nuw i8, ptr addrspace(3) %179, i32 9280
+ %183 = getelementptr inbounds nuw i8, ptr addrspace(3) %179, i32 64
+ %184 = getelementptr inbounds nuw i8, ptr addrspace(3) %179, i32 1024
+ %185 = getelementptr inbounds nuw i8, ptr addrspace(3) %179, i32 8256
+ %186 = getelementptr inbounds nuw i8, ptr addrspace(3) %179, i32 9216
+ br label %187
+
+187: ; preds = %187, %.lr.ph
+ %.pn5751951 = phi i32 [ %78, %.lr.ph ], [ %343, %187 ]
+ %188 = phi float [ 0.000000e+00, %.lr.ph ], [ %560, %187 ]
+ %189 = phi float [ 0.000000e+00, %.lr.ph ], [ %561, %187 ]
+ %190 = phi float [ 0.000000e+00, %.lr.ph ], [ %562, %187 ]
+ %191 = phi float [ 0.000000e+00, %.lr.ph ], [ %569, %187 ]
+ %192 = phi float [ 0.000000e+00, %.lr.ph ], [ %570, %187 ]
+ %193 = phi float [ 0.000000e+00, %.lr.ph ], [ %571, %187 ]
+ %194 = phi float [ 0.000000e+00, %.lr.ph ], [ %572, %187 ]
+ %195 = phi float [ 0.000000e+00, %.lr.ph ], [ %579, %187 ]
+ %196 = phi float [ 0.000000e+00, %.lr.ph ], [ %580, %187 ]
+ %197 = phi float [ 0.000000e+00, %.lr.ph ], [ %581, %187 ]
+ %198 = phi float [ 0.000000e+00, %.lr.ph ], [ %585, %187 ]
+ %199 = phi float [ 0.000000e+00, %.lr.ph ], [ %590, %187 ]
+ %200 = phi float [ 0.000000e+00, %.lr.ph ], [ %591, %187 ]
+ %201 = phi float [ 0.000000e+00, %.lr.ph ], [ %598, %187 ]
+ %202 = phi float [ 0.000000e+00, %.lr.ph ], [ %599, %187 ]
+ %203 = phi float [ 0.000000e+00, %.lr.ph ], [ %600, %187 ]
+ %204 = phi float [ 0.000000e+00, %.lr.ph ], [ %601, %187 ]
+ %205 = phi float [ 0.000000e+00, %.lr.ph ], [ %606, %187 ]
+ %206 = phi float [ 0.000000e+00, %.lr.ph ], [ %607, %187 ]
+ %207 = phi float [ 0.000000e+00, %.lr.ph ], [ %614, %187 ]
+ %208 = phi float [ 0.000000e+00, %.lr.ph ], [ %615, %187 ]
+ %209 = phi float [ 0.000000e+00, %.lr.ph ], [ %616, %187 ]
+ %210 = phi float [ 0.000000e+00, %.lr.ph ], [ %622, %187 ]
+ %211 = phi float [ 0.000000e+00, %.lr.ph ], [ %623, %187 ]
+ %212 = phi float [ 0.000000e+00, %.lr.ph ], [ %630, %187 ]
+ %213 = phi float [ 0.000000e+00, %.lr.ph ], [ %631, %187 ]
+ %214 = phi float [ 0.000000e+00, %.lr.ph ], [ %632, %187 ]
+ %215 = phi float [ 0.000000e+00, %.lr.ph ], [ %633, %187 ]
+ %216 = phi float [ 0.000000e+00, %.lr.ph ], [ %637, %187 ]
+ %217 = phi float [ 0.000000e+00, %.lr.ph ], [ %644, %187 ]
+ %218 = phi float [ 0.000000e+00, %.lr.ph ], [ %645, %187 ]
+ %219 = phi float [ 0.000000e+00, %.lr.ph ], [ %646, %187 ]
+ %220 = phi float [ 0.000000e+00, %.lr.ph ], [ %647, %187 ]
+ %221 = phi float [ 0.000000e+00, %.lr.ph ], [ %654, %187 ]
+ %222 = phi float [ 0.000000e+00, %.lr.ph ], [ %655, %187 ]
+ %223 = phi float [ 0.000000e+00, %.lr.ph ], [ %656, %187 ]
+ %224 = phi float [ 0.000000e+00, %.lr.ph ], [ %657, %187 ]
+ %225 = phi float [ 0.000000e+00, %.lr.ph ], [ %664, %187 ]
+ %226 = phi float [ 0.000000e+00, %.lr.ph ], [ %665, %187 ]
+ %227 = phi float [ 0.000000e+00, %.lr.ph ], [ %666, %187 ]
+ %228 = phi float [ 0.000000e+00, %.lr.ph ], [ %667, %187 ]
+ %229 = phi float [ 0.000000e+00, %.lr.ph ], [ %674, %187 ]
+ %230 = phi float [ 0.000000e+00, %.lr.ph ], [ %675, %187 ]
+ %231 = phi float [ 0.000000e+00, %.lr.ph ], [ %676, %187 ]
+ %232 = phi float [ 0.000000e+00, %.lr.ph ], [ %677, %187 ]
+ %233 = phi float [ 0.000000e+00, %.lr.ph ], [ %684, %187 ]
+ %234 = phi float [ 0.000000e+00, %.lr.ph ], [ %685, %187 ]
+ %235 = phi float [ 0.000000e+00, %.lr.ph ], [ %686, %187 ]
+ %236 = phi float [ 0.000000e+00, %.lr.ph ], [ %691, %187 ]
+ %237 = phi float [ 0.000000e+00, %.lr.ph ], [ %692, %187 ]
+ %238 = phi float [ 0.000000e+00, %.lr.ph ], [ %696, %187 ]
+ %239 = phi float [ 0.000000e+00, %.lr.ph ], [ %703, %187 ]
+ %240 = phi float [ 0.000000e+00, %.lr.ph ], [ %704, %187 ]
+ %241 = phi float [ 0.000000e+00, %.lr.ph ], [ %705, %187 ]
+ %242 = phi float [ 0.000000e+00, %.lr.ph ], [ %706, %187 ]
+ %243 = phi float [ 0.000000e+00, %.lr.ph ], [ %714, %187 ]
+ %244 = phi float [ 0.000000e+00, %.lr.ph ], [ %7...
[truncated]
I would guess that the condition you're checking for here is too strong. The MIR is not in SSA form and it's not necessary that every reload is dominated by a single spill, just that all spills together jointly dominate it. But maybe that's a harder condition to check.
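The check described here can be phrased as a reachability problem: a reload is jointly dominated by the stores of its slot exactly when it cannot be reached from the function entry without first passing one of those stores. Below is a minimal, self-contained sketch of that idea on a toy CFG; the Block struct, reloadsJointlyDominated(), and the example graphs are invented for illustration and are not code from this PR, which operates on MachineBasicBlocks.

// Toy joint-dominance check: walk the CFG from entry, refusing to walk past
// blocks that contain a store to the slot. If any reload is reached on such a
// store-free path, the slot cannot be rewritten safely.
#include <cstdio>
#include <queue>
#include <vector>

struct Block {
  std::vector<int> Succs;        // successor block indices
  bool HasStore = false;         // block contains a spill store for the slot
  bool LoadBeforeStore = false;  // block reloads the slot before any store in it
};

// Returns true if every reload is jointly dominated by the set of stores,
// i.e. no reload can be reached from entry (block 0) along a path that
// bypasses every store to the slot. Blocks unreachable from entry are never
// visited, which mirrors the isReachableFromEntry handling in the patch.
static bool reloadsJointlyDominated(const std::vector<Block> &CFG) {
  std::vector<bool> Visited(CFG.size(), false);
  std::queue<int> Worklist;
  Worklist.push(0);
  Visited[0] = true;
  while (!Worklist.empty()) {
    int B = Worklist.front();
    Worklist.pop();
    // A reload that appears before any store in this block has been reached
    // on a store-free path: joint domination fails.
    if (CFG[B].LoadBeforeStore)
      return false;
    // A store covers the rest of this block and every path continuing from
    // it, so do not propagate further.
    if (CFG[B].HasStore)
      continue;
    for (int S : CFG[B].Succs) {
      if (!Visited[S]) {
        Visited[S] = true;
        Worklist.push(S);
      }
    }
  }
  return true;
}

int main() {
  // Diamond: stores in both branches (1 and 2), reload after the join (3).
  // No single store dominates the reload, but the stores jointly do, so a
  // per-store dominance check would be too strong for this shape.
  std::vector<Block> Diamond(4);
  Diamond[0].Succs = {1, 2};
  Diamond[1].HasStore = true;
  Diamond[1].Succs = {3};
  Diamond[2].HasStore = true;
  Diamond[2].Succs = {3};
  Diamond[3].LoadBeforeStore = true;
  std::printf("diamond jointly dominated: %d\n", reloadsJointlyDominated(Diamond));

  // Bypass: one branch (2) reaches the reload without any store. This is the
  // shape that produced invalid MIR, so elimination must be skipped.
  std::vector<Block> Bypass(4);
  Bypass[0].Succs = {1, 2};
  Bypass[1].HasStore = true;
  Bypass[1].Succs = {3};
  Bypass[2].Succs = {3};
  Bypass[3].LoadBeforeStore = true;
  std::printf("bypass jointly dominated: %d\n", reloadsJointlyDominated(Bypass));
  return 0;
}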
qcolombet left a comment

Can the test case be reduced further?
Can we have a unit test that exercises this one pass alone (i.e., a .mir test)?
Generally speaking, I'm not a fan of tests that only check "do not crash".

+1 on that
%76 = tail call i32 @llvm.amdgcn.workitem.id.x()

Use named values
declare void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16, ptr addrspace(8) writeonly captures(none), i32, i32, i32 immarg) #7

; uselistorder directives
uselistorder ptr @llvm.amdgcn.raw.ptr.buffer.load.i16, { 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }

Shouldn't need all these uselistorders
Rev1: Updated the condition to check for "joint domination", i.e. no reload is reachable from entry without reaching a store to the same slot. Still working on a reduced test or unit test.

When performing spill elimination in the AGPR copy rewrite pass, it was possible to see spill reloads that were not jointly dominated by the stores to the same slot. This caused invalid MIR to be generated, where vreg uses were not dominated by defs. This patch adds a joint dominance check before rewriting spills.
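One precise way to state the Rev1 condition (a paraphrase, not wording from the patch): a slot is safe to rewrite iff

\[ \forall r \in \mathrm{Reloads}(slot),\ \forall \text{ CFG paths } P \text{ from entry to } r:\ \exists s \in \mathrm{Stores}(slot) \text{ such that } s \text{ lies on } P \text{ before } r, \]

where a store earlier in the same basic block as the reload counts as being before it.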