Skip to content

Conversation

@ronlieb
Copy link
Contributor

@ronlieb ronlieb commented Nov 29, 2025

Reverts #168259

breaks hip buildbot

@llvmbot
Copy link
Member

llvmbot commented Nov 29, 2025

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-backend-x86

Author: theRonShark (ronlieb)

Changes

Reverts llvm/llvm-project#168259

breaks hip buildbot


Patch is 560.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169990.diff

11 Files Affected:

  • (modified) llvm/lib/CodeGen/RegAllocGreedy.cpp (+7-2)
  • (modified) llvm/lib/CodeGen/SplitKit.cpp (-48)
  • (modified) llvm/lib/CodeGen/SplitKit.h (-8)
  • (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+2662-2707)
  • (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+95-92)
  • (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+143-150)
  • (modified) llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir (+31-31)
  • (modified) llvm/test/CodeGen/AMDGPU/spill-before-exec.mir (-5)
  • (removed) llvm/test/CodeGen/AMDGPU/spill-before-exec2.mir (-167)
  • (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+268-269)
  • (modified) llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll (+38-40)
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 4db20dc39fb32..a059cb55371a3 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -774,7 +774,8 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
       // Abort if the spill cannot be inserted at the MBB' start
       if (((BC.Entry == SpillPlacement::MustSpill) ||
            (BC.Entry == SpillPlacement::PrefSpill)) &&
-          !SA->canSplitBeforeProlog(BC.Number))
+          SlotIndex::isEarlierInstr(BI.FirstInstr,
+                                    SA->getFirstSplitPoint(BC.Number)))
         return false;
     }
 
@@ -829,7 +830,11 @@ bool RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
     BCS[B].Number = Number;
 
     // Abort if the spill cannot be inserted at the MBB' start
-    if (!SA->canSplitBeforeProlog(Number))
+    MachineBasicBlock *MBB = MF->getBlockNumbered(Number);
+    auto FirstNonDebugInstr = MBB->getFirstNonDebugInstr();
+    if (FirstNonDebugInstr != MBB->end() &&
+        SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*FirstNonDebugInstr),
+                                  SA->getFirstSplitPoint(Number)))
       return false;
     // Interference for the live-in value.
     if (Intf.first() <= Indexes->getMBBStartIdx(Number))
diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index f27ff674dcf8c..8ec4bfbb5a330 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -147,54 +147,6 @@ InsertPointAnalysis::getLastInsertPointIter(const LiveInterval &CurLI,
   return LIS.getInstructionFromIndex(LIP);
 }
 
-bool InsertPointAnalysis::canSplitBeforeProlog(const LiveInterval &CurLI,
-                                               const MachineBasicBlock &MBB) {
-  const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
-
-  for (auto &MI : MBB) {
-    if (MI.isPHI() || MI.isPosition() || MI.isDebugInstr() ||
-        MI.isPseudoProbe())
-      continue;
-
-    if (!TII->isBasicBlockPrologue(MI))
-      return true;
-
-    for (auto &MO : MI.operands()) {
-      if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
-        continue;
-
-      // For the AMDGPU target if a MBB contains exec mask restore preamble,
-      // SplitEditor may get state when it cannot insert a spill instruction
-      // at the begin of the MBB.
-      // E.g. for a MIR
-      // bb.100:
-      //     %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc,
-      //          implicit $exec
-      //     ...
-      //     use %1
-      // If the regalloc try to allocate a virtreg to the physreg already
-      // assigned to virtreg %1 and the pyhsreg is computed as the best
-      // candidate for split, it may insert COPY instruction.
-      //  bb.100:
-      //     %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc,
-      //          implicit $exec
-      //     %2 = COPY %orig
-      //     ...
-      //     use %1
-      // Thus %1 and %orig still have interference. We may add cost for the
-      // physreg candidate or abandon the candidate.
-      const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-      const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
-      const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
-      const TargetRegisterClass *CurRC = MRI.getRegClass(CurLI.reg());
-      if (TRI->getCommonSubClass(RC, CurRC))
-        return false;
-    }
-  }
-
-  return true;
-}
-
 //===----------------------------------------------------------------------===//
 //                                 Split Analysis
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h
index a9fc921534d0e..de255911268f2 100644
--- a/llvm/lib/CodeGen/SplitKit.h
+++ b/llvm/lib/CodeGen/SplitKit.h
@@ -89,9 +89,6 @@ class LLVM_LIBRARY_VISIBILITY InsertPointAnalysis {
     return Res;
   }
 
-  /// Return true if we can split \pCurLI before \pMBB's prolog.
-  bool canSplitBeforeProlog(const LiveInterval &CurLI,
-                            const MachineBasicBlock &MBB);
 };
 
 /// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
@@ -250,11 +247,6 @@ class LLVM_LIBRARY_VISIBILITY SplitAnalysis {
   SlotIndex getFirstSplitPoint(unsigned Num) {
     return IPA.getFirstInsertPoint(*MF.getBlockNumbered(Num));
   }
-
-  bool canSplitBeforeProlog(unsigned Num) {
-    MachineBasicBlock *BB = MF.getBlockNumbered(Num);
-    return IPA.canSplitBeforeProlog(*CurLI, *BB);
-  }
 };
 
 /// SplitEditor - Edit machine code and LiveIntervals for live range
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 10f7b701c3122..4c5c56a49fdc6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -151238,13 +151238,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:308
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:304
 ; SI-NEXT:    ; implicit-def: $vgpr44 : SGPR spill to VGPR lane
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_writelane_b32 v41, s30, 0
+; SI-NEXT:    s_mov_b32 s73, s21
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v44, s19, 0
 ; SI-NEXT:    v_writelane_b32 v44, s18, 1
 ; SI-NEXT:    v_writelane_b32 v44, s17, 2
 ; SI-NEXT:    v_writelane_b32 v44, s16, 3
+; SI-NEXT:    v_writelane_b32 v41, s30, 0
 ; SI-NEXT:    v_writelane_b32 v41, s31, 1
 ; SI-NEXT:    v_writelane_b32 v41, s34, 2
 ; SI-NEXT:    v_writelane_b32 v41, s35, 3
@@ -151268,8 +151268,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_writelane_b32 v41, s69, 21
 ; SI-NEXT:    v_writelane_b32 v41, s70, 22
 ; SI-NEXT:    v_writelane_b32 v41, s71, 23
-; SI-NEXT:    s_mov_b32 s57, s28
-; SI-NEXT:    s_mov_b32 s47, s27
+; SI-NEXT:    s_mov_b32 s74, s29
+; SI-NEXT:    s_mov_b32 s78, s28
+; SI-NEXT:    s_mov_b32 s76, s27
 ; SI-NEXT:    v_writelane_b32 v41, s80, 24
 ; SI-NEXT:    v_writelane_b32 v41, s81, 25
 ; SI-NEXT:    v_writelane_b32 v41, s82, 26
@@ -151279,6 +151280,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_writelane_b32 v41, s86, 30
 ; SI-NEXT:    v_writelane_b32 v41, s87, 31
 ; SI-NEXT:    v_writelane_b32 v41, s96, 32
+; SI-NEXT:    s_mov_b32 s47, s26
 ; SI-NEXT:    v_writelane_b32 v41, s97, 33
 ; SI-NEXT:    v_writelane_b32 v41, s98, 34
 ; SI-NEXT:    v_writelane_b32 v41, s99, 35
@@ -151288,101 +151290,95 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:156
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:152
-; SI-NEXT:    v_readfirstlane_b32 s89, v3
-; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT:    v_readfirstlane_b32 s90, v9
-; SI-NEXT:    v_writelane_b32 v42, s89, 0
-; SI-NEXT:    v_readfirstlane_b32 s91, v10
-; SI-NEXT:    v_writelane_b32 v42, s90, 1
-; SI-NEXT:    v_readfirstlane_b32 s92, v8
-; SI-NEXT:    v_writelane_b32 v42, s91, 2
-; SI-NEXT:    v_readfirstlane_b32 s93, v7
-; SI-NEXT:    v_writelane_b32 v42, s92, 3
-; SI-NEXT:    v_readfirstlane_b32 s94, v13
-; SI-NEXT:    v_writelane_b32 v42, s93, 4
-; SI-NEXT:    v_readfirstlane_b32 s95, v14
-; SI-NEXT:    v_writelane_b32 v42, s94, 5
-; SI-NEXT:    v_writelane_b32 v42, s95, 6
-; SI-NEXT:    v_readfirstlane_b32 s30, v17
-; SI-NEXT:    v_readfirstlane_b32 s31, v18
-; SI-NEXT:    v_readfirstlane_b32 s34, v16
-; SI-NEXT:    v_readfirstlane_b32 s35, v15
-; SI-NEXT:    v_readfirstlane_b32 s36, v21
 ; SI-NEXT:    v_readfirstlane_b32 s37, v22
+; SI-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
 ; SI-NEXT:    v_readfirstlane_b32 s38, v20
+; SI-NEXT:    v_writelane_b32 v43, s37, 0
 ; SI-NEXT:    v_readfirstlane_b32 s39, v19
+; SI-NEXT:    v_writelane_b32 v43, s38, 1
 ; SI-NEXT:    v_readfirstlane_b32 s48, v25
+; SI-NEXT:    v_writelane_b32 v43, s39, 2
 ; SI-NEXT:    v_readfirstlane_b32 s49, v26
+; SI-NEXT:    v_writelane_b32 v43, s48, 3
 ; SI-NEXT:    v_readfirstlane_b32 s50, v24
+; SI-NEXT:    v_writelane_b32 v43, s49, 4
 ; SI-NEXT:    v_readfirstlane_b32 s51, v23
+; SI-NEXT:    v_writelane_b32 v43, s50, 5
 ; SI-NEXT:    v_readfirstlane_b32 s52, v29
+; SI-NEXT:    v_writelane_b32 v43, s51, 6
 ; SI-NEXT:    v_readfirstlane_b32 s53, v30
+; SI-NEXT:    v_writelane_b32 v43, s52, 7
+; SI-NEXT:    v_readfirstlane_b32 s54, v28
+; SI-NEXT:    v_writelane_b32 v43, s53, 8
+; SI-NEXT:    v_readfirstlane_b32 s55, v27
+; SI-NEXT:    v_writelane_b32 v43, s54, 9
+; SI-NEXT:    v_writelane_b32 v43, s55, 10
+; SI-NEXT:    s_mov_b32 s57, s24
+; SI-NEXT:    v_readfirstlane_b32 s16, v1
+; SI-NEXT:    v_readfirstlane_b32 s17, v2
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s6, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:300
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:296
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:292
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:288
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:284
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:280
-; SI-NEXT:    v_writelane_b32 v44, s4, 4
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v32
-; SI-NEXT:    v_writelane_b32 v44, s4, 5
+; SI-NEXT:    v_writelane_b32 v44, s4, 4
 ; SI-NEXT:    v_readfirstlane_b32 s4, v33
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:276
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:272
-; SI-NEXT:    v_writelane_b32 v44, s4, 6
+; SI-NEXT:    v_writelane_b32 v44, s4, 5
 ; SI-NEXT:    v_readfirstlane_b32 s4, v34
-; SI-NEXT:    v_writelane_b32 v44, s4, 7
+; SI-NEXT:    v_writelane_b32 v44, s4, 6
 ; SI-NEXT:    v_readfirstlane_b32 s4, v35
-; SI-NEXT:    v_writelane_b32 v44, s4, 8
+; SI-NEXT:    v_writelane_b32 v44, s4, 7
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v36
-; SI-NEXT:    v_writelane_b32 v44, s4, 9
+; SI-NEXT:    v_writelane_b32 v44, s4, 8
 ; SI-NEXT:    v_readfirstlane_b32 s4, v37
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:268
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:264
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:260
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:256
-; SI-NEXT:    v_writelane_b32 v44, s4, 10
+; SI-NEXT:    v_writelane_b32 v44, s4, 9
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v38
-; SI-NEXT:    v_writelane_b32 v44, s4, 11
-; SI-NEXT:    v_readfirstlane_b32 s54, v28
-; SI-NEXT:    v_readfirstlane_b32 s55, v27
-; SI-NEXT:    s_mov_b32 s6, s23
-; SI-NEXT:    s_mov_b32 s23, s21
-; SI-NEXT:    s_mov_b32 s58, s26
-; SI-NEXT:    s_mov_b32 s40, s25
-; SI-NEXT:    s_mov_b32 s25, s24
-; SI-NEXT:    v_readfirstlane_b32 s16, v1
-; SI-NEXT:    v_readfirstlane_b32 s17, v2
+; SI-NEXT:    v_writelane_b32 v44, s4, 10
 ; SI-NEXT:    v_readfirstlane_b32 s18, v5
 ; SI-NEXT:    v_readfirstlane_b32 s19, v6
 ; SI-NEXT:    v_readfirstlane_b32 s77, v4
-; SI-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
-; SI-NEXT:    v_readfirstlane_b32 s26, v53
-; SI-NEXT:    v_readfirstlane_b32 s46, v54
-; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_readfirstlane_b32 s61, v55
+; SI-NEXT:    v_readfirstlane_b32 s89, v3
+; SI-NEXT:    v_readfirstlane_b32 s90, v9
+; SI-NEXT:    v_readfirstlane_b32 s91, v10
+; SI-NEXT:    v_readfirstlane_b32 s92, v8
+; SI-NEXT:    v_readfirstlane_b32 s93, v7
+; SI-NEXT:    v_readfirstlane_b32 s94, v13
+; SI-NEXT:    v_readfirstlane_b32 s95, v14
+; SI-NEXT:    v_readfirstlane_b32 s30, v17
+; SI-NEXT:    v_readfirstlane_b32 s31, v18
+; SI-NEXT:    v_readfirstlane_b32 s34, v16
+; SI-NEXT:    v_readfirstlane_b32 s35, v15
+; SI-NEXT:    v_readfirstlane_b32 s36, v21
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s62, v40
+; SI-NEXT:    v_readfirstlane_b32 s24, v40
 ; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
-; SI-NEXT:    v_writelane_b32 v44, s4, 12
+; SI-NEXT:    v_writelane_b32 v44, s4, 11
 ; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v39
-; SI-NEXT:    v_writelane_b32 v44, s4, 13
+; SI-NEXT:    v_writelane_b32 v44, s4, 12
 ; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v48
-; SI-NEXT:    v_writelane_b32 v44, s4, 14
+; SI-NEXT:    v_writelane_b32 v44, s4, 13
 ; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v49
-; SI-NEXT:    v_writelane_b32 v44, s4, 15
+; SI-NEXT:    v_writelane_b32 v44, s4, 14
 ; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v50
-; SI-NEXT:    v_writelane_b32 v44, s4, 16
+; SI-NEXT:    v_writelane_b32 v44, s4, 15
 ; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v51
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:252
@@ -151392,51 +151388,40 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:236
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:232
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:228
-; SI-NEXT:    v_writelane_b32 v44, s4, 17
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v32
-; SI-NEXT:    v_writelane_b32 v44, s4, 18
+; SI-NEXT:    v_readfirstlane_b32 s75, v32
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v33
+; SI-NEXT:    v_readfirstlane_b32 s21, v33
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:224
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:220
-; SI-NEXT:    v_writelane_b32 v44, s4, 19
+; SI-NEXT:    v_writelane_b32 v44, s4, 16
 ; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v34
-; SI-NEXT:    v_writelane_b32 v44, s4, 20
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v35
-; SI-NEXT:    v_writelane_b32 v44, s4, 21
+; SI-NEXT:    v_readfirstlane_b32 s40, v35
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_readfirstlane_b32 s4, v36
-; SI-NEXT:    v_writelane_b32 v44, s4, 22
+; SI-NEXT:    v_readfirstlane_b32 s61, v36
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s4, v37
-; SI-NEXT:    v_writelane_b32 v44, s4, 23
+; SI-NEXT:    v_readfirstlane_b32 s63, v37
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:216
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:212
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:208
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:204
+; SI-NEXT:    v_writelane_b32 v44, s4, 17
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
-; SI-NEXT:    v_writelane_b32 v44, s4, 24
+; SI-NEXT:    v_readfirstlane_b32 s59, v31
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v38
-; SI-NEXT:    v_writelane_b32 v44, s4, 25
+; SI-NEXT:    v_readfirstlane_b32 s56, v38
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_readfirstlane_b32 s4, v39
-; SI-NEXT:    v_writelane_b32 v44, s4, 26
+; SI-NEXT:    v_readfirstlane_b32 s43, v39
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s4, v48
-; SI-NEXT:    v_writelane_b32 v44, s4, 27
+; SI-NEXT:    v_readfirstlane_b32 s46, v48
 ; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_readfirstlane_b32 s4, v49
-; SI-NEXT:    v_writelane_b32 v44, s4, 28
+; SI-NEXT:    v_readfirstlane_b32 s42, v49
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_readfirstlane_b32 s4, v50
-; SI-NEXT:    v_writelane_b32 v44, s4, 29
+; SI-NEXT:    v_readfirstlane_b32 s13, v50
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_readfirstlane_b32 s4, v51
+; SI-NEXT:    v_readfirstlane_b32 s45, v51
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:200
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:196
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:192
@@ -151444,47 +151429,45 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:184
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:180
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:176
-; SI-NEXT:    v_writelane_b32 v44, s4, 30
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v32
-; SI-NEXT:    v_writelane_b32 v44, s4, 31
+; SI-NEXT:    v_readfirstlane_b32 s88, v32
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v33
+; SI-NEXT:    v_readfirstlane_b32 s79, v33
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:172
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:168
-; SI-NEXT:    v_writelane_b32 v44, s4, 32
 ; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v34
-; SI-NEXT:    v_writelane_b32 v44, s4, 33
+; SI-NEXT:    v_writelane_b32 v44, s4, 18
 ; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v35
-; SI-NEXT:    v_writelane_b32 v44, s4, 34
+; SI-NEXT:    v_writelane_b32 v44, s4, 19
 ; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v36
-; SI-NEXT:    v_writelane_b32 v44, s4, 35
+; SI-NEXT:    v_writelane_b32 v44, s4, 20
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s43, v37
+; SI-NEXT:    v_readfirstlane_b32 s4, v37
+; SI-NEXT:    v_writelane_b32 v44, s4, 21
 ; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
-; SI-NEXT:    v_writelane_b32 v44, s4, 36
+; SI-NEXT:    v_writelane_b32 v44, s4, 22
 ; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v38
-; SI-NEXT:    v_writelane_b32 v44, s4, 37
+; SI-NEXT:    v_writelane_b32 v44, s4, 23
 ; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v39
-; SI-NEXT:    v_writelane_b32 v44, s4, 38
+; SI-NEXT:    v_writelane_b32 v44, s4, 24
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v48
-; SI-NEXT:    v_writelane_b32 v44, s4, 39
+; SI-NEXT:    v_writelane_b32 v44, s4, 25
 ; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v49
-; SI-NEXT:    v_writelane_b32 v44, s4, 40
+; SI-NEXT:    v_writelane_b32 v44, s4, 26
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v50
-; SI-NEXT:    v_writelane_b32 v44, s4, 41
+; SI-NEXT:    v_writelane_b32 v44, s4, 27
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v51
-; SI-NEXT:    v_writelane_b32 v44, s4, 42
+; SI-NEXT:    v_writelane_b32 v44, s4, 28
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:148
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:144
 ; SI-NEXT:    s_waitcnt vmcnt(3)
@@ -151500,31 +151483,41 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:112
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:108
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:104
-; SI-NEXT:    v_writelane_b32 v44, s4, 43
-; SI-NEXT:    v_writelane_b32 v44, s22, 44
-; SI-NEXT:    v_writelane_b32 v44, s6, 45
-; SI-NEXT:    v_writelane_b32 v44, s23, 46
-; SI-NEXT:    v_writelane_b32 v44, s20, 47
-; SI-NEXT:    v_writelane_b32 v44, s58, 48
-; SI-NEXT:    v_writelane_b32 v44, s47, 49
-; SI-NEXT:    v_writelane_b32 v44, s40, 50
-; SI-NEXT:    v_writelane_b32 v44, s25, 51
-; SI-NEXT:    v_writelane_b32 v44, s29, 52
-; SI-NEXT:    v_writelane_b32 v44, s57, 53
-; SI-NEXT:    v_writelane_b32 v44, s62, 54
+; SI-NEXT:    v_writelane_b32 v44, s4, 29
 ; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_readfirstlane_b32 s21, v52
-; SI-NEXT:    v_writelane_b32 v44, s61, 55
-; SI-NEXT:    v_writelane_b32 v44, s21, 56
-; SI-NEXT:    v_writelane_b32 v44, s26, 57
-; SI-NEXT:    v_writelane_b32 v44, s46, 58
-; SI-NEXT:    v_writelane_b32 v44, s16, 59
-; SI-NEXT:    v_writelane_b32 v44, s17, 60
-; SI-NEXT:    v_writelane_b32 v44, s18, 61
-; SI-NEXT:    v_writelane_b32 v44, s...
[truncated]

@llvmbot
Copy link
Member

llvmbot commented Nov 29, 2025

@llvm/pr-subscribers-llvm-regalloc

Author: theRonShark (ronlieb)

Changes

Reverts llvm/llvm-project#168259

breaks hip buildbot


Patch is 560.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169990.diff

11 Files Affected:

  • (modified) llvm/lib/CodeGen/RegAllocGreedy.cpp (+7-2)
  • (modified) llvm/lib/CodeGen/SplitKit.cpp (-48)
  • (modified) llvm/lib/CodeGen/SplitKit.h (-8)
  • (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+2662-2707)
  • (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+95-92)
  • (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+143-150)
  • (modified) llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir (+31-31)
  • (modified) llvm/test/CodeGen/AMDGPU/spill-before-exec.mir (-5)
  • (removed) llvm/test/CodeGen/AMDGPU/spill-before-exec2.mir (-167)
  • (modified) llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll (+268-269)
  • (modified) llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll (+38-40)
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 4db20dc39fb32..a059cb55371a3 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -774,7 +774,8 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
       // Abort if the spill cannot be inserted at the MBB' start
       if (((BC.Entry == SpillPlacement::MustSpill) ||
            (BC.Entry == SpillPlacement::PrefSpill)) &&
-          !SA->canSplitBeforeProlog(BC.Number))
+          SlotIndex::isEarlierInstr(BI.FirstInstr,
+                                    SA->getFirstSplitPoint(BC.Number)))
         return false;
     }
 
@@ -829,7 +830,11 @@ bool RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
     BCS[B].Number = Number;
 
     // Abort if the spill cannot be inserted at the MBB' start
-    if (!SA->canSplitBeforeProlog(Number))
+    MachineBasicBlock *MBB = MF->getBlockNumbered(Number);
+    auto FirstNonDebugInstr = MBB->getFirstNonDebugInstr();
+    if (FirstNonDebugInstr != MBB->end() &&
+        SlotIndex::isEarlierInstr(LIS->getInstructionIndex(*FirstNonDebugInstr),
+                                  SA->getFirstSplitPoint(Number)))
       return false;
     // Interference for the live-in value.
     if (Intf.first() <= Indexes->getMBBStartIdx(Number))
diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index f27ff674dcf8c..8ec4bfbb5a330 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -147,54 +147,6 @@ InsertPointAnalysis::getLastInsertPointIter(const LiveInterval &CurLI,
   return LIS.getInstructionFromIndex(LIP);
 }
 
-bool InsertPointAnalysis::canSplitBeforeProlog(const LiveInterval &CurLI,
-                                               const MachineBasicBlock &MBB) {
-  const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
-
-  for (auto &MI : MBB) {
-    if (MI.isPHI() || MI.isPosition() || MI.isDebugInstr() ||
-        MI.isPseudoProbe())
-      continue;
-
-    if (!TII->isBasicBlockPrologue(MI))
-      return true;
-
-    for (auto &MO : MI.operands()) {
-      if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual())
-        continue;
-
-      // For the AMDGPU target if a MBB contains exec mask restore preamble,
-      // SplitEditor may get state when it cannot insert a spill instruction
-      // at the begin of the MBB.
-      // E.g. for a MIR
-      // bb.100:
-      //     %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc,
-      //          implicit $exec
-      //     ...
-      //     use %1
-      // If the regalloc try to allocate a virtreg to the physreg already
-      // assigned to virtreg %1 and the pyhsreg is computed as the best
-      // candidate for split, it may insert COPY instruction.
-      //  bb.100:
-      //     %1 = S_OR_SAVEEXEC_B64 %2, implicit-def $exec, implicit-def $scc,
-      //          implicit $exec
-      //     %2 = COPY %orig
-      //     ...
-      //     use %1
-      // Thus %1 and %orig still have interference. We may add cost for the
-      // physreg candidate or abandon the candidate.
-      const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-      const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
-      const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
-      const TargetRegisterClass *CurRC = MRI.getRegClass(CurLI.reg());
-      if (TRI->getCommonSubClass(RC, CurRC))
-        return false;
-    }
-  }
-
-  return true;
-}
-
 //===----------------------------------------------------------------------===//
 //                                 Split Analysis
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h
index a9fc921534d0e..de255911268f2 100644
--- a/llvm/lib/CodeGen/SplitKit.h
+++ b/llvm/lib/CodeGen/SplitKit.h
@@ -89,9 +89,6 @@ class LLVM_LIBRARY_VISIBILITY InsertPointAnalysis {
     return Res;
   }
 
-  /// Return true if we can split \pCurLI before \pMBB's prolog.
-  bool canSplitBeforeProlog(const LiveInterval &CurLI,
-                            const MachineBasicBlock &MBB);
 };
 
 /// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
@@ -250,11 +247,6 @@ class LLVM_LIBRARY_VISIBILITY SplitAnalysis {
   SlotIndex getFirstSplitPoint(unsigned Num) {
     return IPA.getFirstInsertPoint(*MF.getBlockNumbered(Num));
   }
-
-  bool canSplitBeforeProlog(unsigned Num) {
-    MachineBasicBlock *BB = MF.getBlockNumbered(Num);
-    return IPA.canSplitBeforeProlog(*CurLI, *BB);
-  }
 };
 
 /// SplitEditor - Edit machine code and LiveIntervals for live range
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 10f7b701c3122..4c5c56a49fdc6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -151238,13 +151238,13 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:308
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:304
 ; SI-NEXT:    ; implicit-def: $vgpr44 : SGPR spill to VGPR lane
-; SI-NEXT:    s_waitcnt expcnt(3)
-; SI-NEXT:    v_writelane_b32 v41, s30, 0
+; SI-NEXT:    s_mov_b32 s73, s21
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_writelane_b32 v44, s19, 0
 ; SI-NEXT:    v_writelane_b32 v44, s18, 1
 ; SI-NEXT:    v_writelane_b32 v44, s17, 2
 ; SI-NEXT:    v_writelane_b32 v44, s16, 3
+; SI-NEXT:    v_writelane_b32 v41, s30, 0
 ; SI-NEXT:    v_writelane_b32 v41, s31, 1
 ; SI-NEXT:    v_writelane_b32 v41, s34, 2
 ; SI-NEXT:    v_writelane_b32 v41, s35, 3
@@ -151268,8 +151268,9 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_writelane_b32 v41, s69, 21
 ; SI-NEXT:    v_writelane_b32 v41, s70, 22
 ; SI-NEXT:    v_writelane_b32 v41, s71, 23
-; SI-NEXT:    s_mov_b32 s57, s28
-; SI-NEXT:    s_mov_b32 s47, s27
+; SI-NEXT:    s_mov_b32 s74, s29
+; SI-NEXT:    s_mov_b32 s78, s28
+; SI-NEXT:    s_mov_b32 s76, s27
 ; SI-NEXT:    v_writelane_b32 v41, s80, 24
 ; SI-NEXT:    v_writelane_b32 v41, s81, 25
 ; SI-NEXT:    v_writelane_b32 v41, s82, 26
@@ -151279,6 +151280,7 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    v_writelane_b32 v41, s86, 30
 ; SI-NEXT:    v_writelane_b32 v41, s87, 31
 ; SI-NEXT:    v_writelane_b32 v41, s96, 32
+; SI-NEXT:    s_mov_b32 s47, s26
 ; SI-NEXT:    v_writelane_b32 v41, s97, 33
 ; SI-NEXT:    v_writelane_b32 v41, s98, 34
 ; SI-NEXT:    v_writelane_b32 v41, s99, 35
@@ -151288,101 +151290,95 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v55, off, s[0:3], s32 offset:156
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:152
-; SI-NEXT:    v_readfirstlane_b32 s89, v3
-; SI-NEXT:    ; implicit-def: $vgpr42 : SGPR spill to VGPR lane
-; SI-NEXT:    v_readfirstlane_b32 s90, v9
-; SI-NEXT:    v_writelane_b32 v42, s89, 0
-; SI-NEXT:    v_readfirstlane_b32 s91, v10
-; SI-NEXT:    v_writelane_b32 v42, s90, 1
-; SI-NEXT:    v_readfirstlane_b32 s92, v8
-; SI-NEXT:    v_writelane_b32 v42, s91, 2
-; SI-NEXT:    v_readfirstlane_b32 s93, v7
-; SI-NEXT:    v_writelane_b32 v42, s92, 3
-; SI-NEXT:    v_readfirstlane_b32 s94, v13
-; SI-NEXT:    v_writelane_b32 v42, s93, 4
-; SI-NEXT:    v_readfirstlane_b32 s95, v14
-; SI-NEXT:    v_writelane_b32 v42, s94, 5
-; SI-NEXT:    v_writelane_b32 v42, s95, 6
-; SI-NEXT:    v_readfirstlane_b32 s30, v17
-; SI-NEXT:    v_readfirstlane_b32 s31, v18
-; SI-NEXT:    v_readfirstlane_b32 s34, v16
-; SI-NEXT:    v_readfirstlane_b32 s35, v15
-; SI-NEXT:    v_readfirstlane_b32 s36, v21
 ; SI-NEXT:    v_readfirstlane_b32 s37, v22
+; SI-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
 ; SI-NEXT:    v_readfirstlane_b32 s38, v20
+; SI-NEXT:    v_writelane_b32 v43, s37, 0
 ; SI-NEXT:    v_readfirstlane_b32 s39, v19
+; SI-NEXT:    v_writelane_b32 v43, s38, 1
 ; SI-NEXT:    v_readfirstlane_b32 s48, v25
+; SI-NEXT:    v_writelane_b32 v43, s39, 2
 ; SI-NEXT:    v_readfirstlane_b32 s49, v26
+; SI-NEXT:    v_writelane_b32 v43, s48, 3
 ; SI-NEXT:    v_readfirstlane_b32 s50, v24
+; SI-NEXT:    v_writelane_b32 v43, s49, 4
 ; SI-NEXT:    v_readfirstlane_b32 s51, v23
+; SI-NEXT:    v_writelane_b32 v43, s50, 5
 ; SI-NEXT:    v_readfirstlane_b32 s52, v29
+; SI-NEXT:    v_writelane_b32 v43, s51, 6
 ; SI-NEXT:    v_readfirstlane_b32 s53, v30
+; SI-NEXT:    v_writelane_b32 v43, s52, 7
+; SI-NEXT:    v_readfirstlane_b32 s54, v28
+; SI-NEXT:    v_writelane_b32 v43, s53, 8
+; SI-NEXT:    v_readfirstlane_b32 s55, v27
+; SI-NEXT:    v_writelane_b32 v43, s54, 9
+; SI-NEXT:    v_writelane_b32 v43, s55, 10
+; SI-NEXT:    s_mov_b32 s57, s24
+; SI-NEXT:    v_readfirstlane_b32 s16, v1
+; SI-NEXT:    v_readfirstlane_b32 s17, v2
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
+; SI-NEXT:    v_readfirstlane_b32 s6, v31
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:300
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:296
 ; SI-NEXT:    buffer_load_dword v48, off, s[0:3], s32 offset:292
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:288
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:284
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:280
-; SI-NEXT:    v_writelane_b32 v44, s4, 4
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v32
-; SI-NEXT:    v_writelane_b32 v44, s4, 5
+; SI-NEXT:    v_writelane_b32 v44, s4, 4
 ; SI-NEXT:    v_readfirstlane_b32 s4, v33
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:276
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:272
-; SI-NEXT:    v_writelane_b32 v44, s4, 6
+; SI-NEXT:    v_writelane_b32 v44, s4, 5
 ; SI-NEXT:    v_readfirstlane_b32 s4, v34
-; SI-NEXT:    v_writelane_b32 v44, s4, 7
+; SI-NEXT:    v_writelane_b32 v44, s4, 6
 ; SI-NEXT:    v_readfirstlane_b32 s4, v35
-; SI-NEXT:    v_writelane_b32 v44, s4, 8
+; SI-NEXT:    v_writelane_b32 v44, s4, 7
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v36
-; SI-NEXT:    v_writelane_b32 v44, s4, 9
+; SI-NEXT:    v_writelane_b32 v44, s4, 8
 ; SI-NEXT:    v_readfirstlane_b32 s4, v37
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:268
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:264
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:260
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:256
-; SI-NEXT:    v_writelane_b32 v44, s4, 10
+; SI-NEXT:    v_writelane_b32 v44, s4, 9
 ; SI-NEXT:    s_waitcnt vmcnt(14)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v38
-; SI-NEXT:    v_writelane_b32 v44, s4, 11
-; SI-NEXT:    v_readfirstlane_b32 s54, v28
-; SI-NEXT:    v_readfirstlane_b32 s55, v27
-; SI-NEXT:    s_mov_b32 s6, s23
-; SI-NEXT:    s_mov_b32 s23, s21
-; SI-NEXT:    s_mov_b32 s58, s26
-; SI-NEXT:    s_mov_b32 s40, s25
-; SI-NEXT:    s_mov_b32 s25, s24
-; SI-NEXT:    v_readfirstlane_b32 s16, v1
-; SI-NEXT:    v_readfirstlane_b32 s17, v2
+; SI-NEXT:    v_writelane_b32 v44, s4, 10
 ; SI-NEXT:    v_readfirstlane_b32 s18, v5
 ; SI-NEXT:    v_readfirstlane_b32 s19, v6
 ; SI-NEXT:    v_readfirstlane_b32 s77, v4
-; SI-NEXT:    ; implicit-def: $vgpr43 : SGPR spill to VGPR lane
-; SI-NEXT:    v_readfirstlane_b32 s26, v53
-; SI-NEXT:    v_readfirstlane_b32 s46, v54
-; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_readfirstlane_b32 s61, v55
+; SI-NEXT:    v_readfirstlane_b32 s89, v3
+; SI-NEXT:    v_readfirstlane_b32 s90, v9
+; SI-NEXT:    v_readfirstlane_b32 s91, v10
+; SI-NEXT:    v_readfirstlane_b32 s92, v8
+; SI-NEXT:    v_readfirstlane_b32 s93, v7
+; SI-NEXT:    v_readfirstlane_b32 s94, v13
+; SI-NEXT:    v_readfirstlane_b32 s95, v14
+; SI-NEXT:    v_readfirstlane_b32 s30, v17
+; SI-NEXT:    v_readfirstlane_b32 s31, v18
+; SI-NEXT:    v_readfirstlane_b32 s34, v16
+; SI-NEXT:    v_readfirstlane_b32 s35, v15
+; SI-NEXT:    v_readfirstlane_b32 s36, v21
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s62, v40
+; SI-NEXT:    v_readfirstlane_b32 s24, v40
 ; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
-; SI-NEXT:    v_writelane_b32 v44, s4, 12
+; SI-NEXT:    v_writelane_b32 v44, s4, 11
 ; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v39
-; SI-NEXT:    v_writelane_b32 v44, s4, 13
+; SI-NEXT:    v_writelane_b32 v44, s4, 12
 ; SI-NEXT:    s_waitcnt vmcnt(9)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v48
-; SI-NEXT:    v_writelane_b32 v44, s4, 14
+; SI-NEXT:    v_writelane_b32 v44, s4, 13
 ; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v49
-; SI-NEXT:    v_writelane_b32 v44, s4, 15
+; SI-NEXT:    v_writelane_b32 v44, s4, 14
 ; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v50
-; SI-NEXT:    v_writelane_b32 v44, s4, 16
+; SI-NEXT:    v_writelane_b32 v44, s4, 15
 ; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v51
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:252
@@ -151392,51 +151388,40 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:236
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:232
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:228
-; SI-NEXT:    v_writelane_b32 v44, s4, 17
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v32
-; SI-NEXT:    v_writelane_b32 v44, s4, 18
+; SI-NEXT:    v_readfirstlane_b32 s75, v32
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v33
+; SI-NEXT:    v_readfirstlane_b32 s21, v33
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:224
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:220
-; SI-NEXT:    v_writelane_b32 v44, s4, 19
+; SI-NEXT:    v_writelane_b32 v44, s4, 16
 ; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v34
-; SI-NEXT:    v_writelane_b32 v44, s4, 20
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v35
-; SI-NEXT:    v_writelane_b32 v44, s4, 21
+; SI-NEXT:    v_readfirstlane_b32 s40, v35
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_readfirstlane_b32 s4, v36
-; SI-NEXT:    v_writelane_b32 v44, s4, 22
+; SI-NEXT:    v_readfirstlane_b32 s61, v36
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s4, v37
-; SI-NEXT:    v_writelane_b32 v44, s4, 23
+; SI-NEXT:    v_readfirstlane_b32 s63, v37
 ; SI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:216
 ; SI-NEXT:    buffer_load_dword v35, off, s[0:3], s32 offset:212
 ; SI-NEXT:    buffer_load_dword v36, off, s[0:3], s32 offset:208
 ; SI-NEXT:    buffer_load_dword v37, off, s[0:3], s32 offset:204
+; SI-NEXT:    v_writelane_b32 v44, s4, 17
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v31
-; SI-NEXT:    v_writelane_b32 v44, s4, 24
+; SI-NEXT:    v_readfirstlane_b32 s59, v31
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v38
-; SI-NEXT:    v_writelane_b32 v44, s4, 25
+; SI-NEXT:    v_readfirstlane_b32 s56, v38
 ; SI-NEXT:    s_waitcnt vmcnt(10)
-; SI-NEXT:    v_readfirstlane_b32 s4, v39
-; SI-NEXT:    v_writelane_b32 v44, s4, 26
+; SI-NEXT:    v_readfirstlane_b32 s43, v39
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s4, v48
-; SI-NEXT:    v_writelane_b32 v44, s4, 27
+; SI-NEXT:    v_readfirstlane_b32 s46, v48
 ; SI-NEXT:    s_waitcnt vmcnt(8)
-; SI-NEXT:    v_readfirstlane_b32 s4, v49
-; SI-NEXT:    v_writelane_b32 v44, s4, 28
+; SI-NEXT:    v_readfirstlane_b32 s42, v49
 ; SI-NEXT:    s_waitcnt vmcnt(7)
-; SI-NEXT:    v_readfirstlane_b32 s4, v50
-; SI-NEXT:    v_writelane_b32 v44, s4, 29
+; SI-NEXT:    v_readfirstlane_b32 s13, v50
 ; SI-NEXT:    s_waitcnt vmcnt(6)
-; SI-NEXT:    v_readfirstlane_b32 s4, v51
+; SI-NEXT:    v_readfirstlane_b32 s45, v51
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:200
 ; SI-NEXT:    buffer_load_dword v38, off, s[0:3], s32 offset:196
 ; SI-NEXT:    buffer_load_dword v39, off, s[0:3], s32 offset:192
@@ -151444,47 +151429,45 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:184
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:180
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:176
-; SI-NEXT:    v_writelane_b32 v44, s4, 30
 ; SI-NEXT:    s_waitcnt vmcnt(12)
-; SI-NEXT:    v_readfirstlane_b32 s4, v32
-; SI-NEXT:    v_writelane_b32 v44, s4, 31
+; SI-NEXT:    v_readfirstlane_b32 s88, v32
 ; SI-NEXT:    s_waitcnt vmcnt(11)
-; SI-NEXT:    v_readfirstlane_b32 s4, v33
+; SI-NEXT:    v_readfirstlane_b32 s79, v33
 ; SI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:172
 ; SI-NEXT:    buffer_load_dword v52, off, s[0:3], s32 offset:168
-; SI-NEXT:    v_writelane_b32 v44, s4, 32
 ; SI-NEXT:    s_waitcnt vmcnt(12)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v34
-; SI-NEXT:    v_writelane_b32 v44, s4, 33
+; SI-NEXT:    v_writelane_b32 v44, s4, 18
 ; SI-NEXT:    s_waitcnt vmcnt(11)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v35
-; SI-NEXT:    v_writelane_b32 v44, s4, 34
+; SI-NEXT:    v_writelane_b32 v44, s4, 19
 ; SI-NEXT:    s_waitcnt vmcnt(10)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v36
-; SI-NEXT:    v_writelane_b32 v44, s4, 35
+; SI-NEXT:    v_writelane_b32 v44, s4, 20
 ; SI-NEXT:    s_waitcnt vmcnt(9)
-; SI-NEXT:    v_readfirstlane_b32 s43, v37
+; SI-NEXT:    v_readfirstlane_b32 s4, v37
+; SI-NEXT:    v_writelane_b32 v44, s4, 21
 ; SI-NEXT:    s_waitcnt vmcnt(8)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v31
-; SI-NEXT:    v_writelane_b32 v44, s4, 36
+; SI-NEXT:    v_writelane_b32 v44, s4, 22
 ; SI-NEXT:    s_waitcnt vmcnt(7)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v38
-; SI-NEXT:    v_writelane_b32 v44, s4, 37
+; SI-NEXT:    v_writelane_b32 v44, s4, 23
 ; SI-NEXT:    s_waitcnt vmcnt(6)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v39
-; SI-NEXT:    v_writelane_b32 v44, s4, 38
+; SI-NEXT:    v_writelane_b32 v44, s4, 24
 ; SI-NEXT:    s_waitcnt vmcnt(5)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v48
-; SI-NEXT:    v_writelane_b32 v44, s4, 39
+; SI-NEXT:    v_writelane_b32 v44, s4, 25
 ; SI-NEXT:    s_waitcnt vmcnt(4)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v49
-; SI-NEXT:    v_writelane_b32 v44, s4, 40
+; SI-NEXT:    v_writelane_b32 v44, s4, 26
 ; SI-NEXT:    s_waitcnt vmcnt(3)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v50
-; SI-NEXT:    v_writelane_b32 v44, s4, 41
+; SI-NEXT:    v_writelane_b32 v44, s4, 27
 ; SI-NEXT:    s_waitcnt vmcnt(2)
 ; SI-NEXT:    v_readfirstlane_b32 s4, v51
-; SI-NEXT:    v_writelane_b32 v44, s4, 42
+; SI-NEXT:    v_writelane_b32 v44, s4, 28
 ; SI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:148
 ; SI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:144
 ; SI-NEXT:    s_waitcnt vmcnt(3)
@@ -151500,31 +151483,41 @@ define inreg <64 x bfloat> @bitcast_v128i8_to_v64bf16_scalar(<128 x i8> inreg %a
 ; SI-NEXT:    buffer_load_dword v49, off, s[0:3], s32 offset:112
 ; SI-NEXT:    buffer_load_dword v50, off, s[0:3], s32 offset:108
 ; SI-NEXT:    buffer_load_dword v51, off, s[0:3], s32 offset:104
-; SI-NEXT:    v_writelane_b32 v44, s4, 43
-; SI-NEXT:    v_writelane_b32 v44, s22, 44
-; SI-NEXT:    v_writelane_b32 v44, s6, 45
-; SI-NEXT:    v_writelane_b32 v44, s23, 46
-; SI-NEXT:    v_writelane_b32 v44, s20, 47
-; SI-NEXT:    v_writelane_b32 v44, s58, 48
-; SI-NEXT:    v_writelane_b32 v44, s47, 49
-; SI-NEXT:    v_writelane_b32 v44, s40, 50
-; SI-NEXT:    v_writelane_b32 v44, s25, 51
-; SI-NEXT:    v_writelane_b32 v44, s29, 52
-; SI-NEXT:    v_writelane_b32 v44, s57, 53
-; SI-NEXT:    v_writelane_b32 v44, s62, 54
+; SI-NEXT:    v_writelane_b32 v44, s4, 29
 ; SI-NEXT:    s_waitcnt vmcnt(13)
-; SI-NEXT:    v_readfirstlane_b32 s21, v52
-; SI-NEXT:    v_writelane_b32 v44, s61, 55
-; SI-NEXT:    v_writelane_b32 v44, s21, 56
-; SI-NEXT:    v_writelane_b32 v44, s26, 57
-; SI-NEXT:    v_writelane_b32 v44, s46, 58
-; SI-NEXT:    v_writelane_b32 v44, s16, 59
-; SI-NEXT:    v_writelane_b32 v44, s17, 60
-; SI-NEXT:    v_writelane_b32 v44, s18, 61
-; SI-NEXT:    v_writelane_b32 v44, s...
[truncated]

@github-actions
Copy link

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff origin/main HEAD --extensions h,cpp -- llvm/lib/CodeGen/RegAllocGreedy.cpp llvm/lib/CodeGen/SplitKit.cpp llvm/lib/CodeGen/SplitKit.h --diff_from_common_commit

⚠️
The reproduction instructions above might return results for more than one PR
in a stack if you are using a stacked PR workflow. You can limit the results by
changing origin/main to the base branch/commit you want to compare against.
⚠️

View the diff from clang-format here.
diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h
index de2559112..15b0f0515 100644
--- a/llvm/lib/CodeGen/SplitKit.h
+++ b/llvm/lib/CodeGen/SplitKit.h
@@ -88,7 +88,6 @@ public:
     }
     return Res;
   }
-
 };
 
 /// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting

@LuoYuanke
Copy link
Contributor

LuoYuanke commented Nov 29, 2025

Please feel free to revert the patch if it breaks HIP. I tried to look into the issue, but unfortunately I am not able to reproduce it locally. Can someone help by dumping the assembly both with and without the patch (#168259)?

@ronlieb ronlieb merged commit 3a1079f into main Nov 29, 2025
14 of 15 checks passed
@ronlieb ronlieb deleted the revert-168259-ra-split-point branch November 29, 2025 13:01
@llvm-ci
Copy link
Collaborator

llvm-ci commented Nov 29, 2025

LLVM Buildbot has detected a new failure on builder llvm-nvptx-nvidia-win running on as-builder-8 while building llvm at step 7 "test-build-unified-tree-check-llvm".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/54/builds/15135

Here is the relevant piece of the build log for reference:
Step 7 (test-build-unified-tree-check-llvm) failure: test (failure)
******************** TEST 'LLVM-Unit :: Support/./SupportTests.exe/79/105' FAILED ********************
Script(shard):
--
GTEST_OUTPUT=json:C:\buildbot\as-builder-8\llvm-nvptx-nvidia-win\build\unittests\Support\.\SupportTests.exe-LLVM-Unit-13744-79-105.json GTEST_SHUFFLE=0 GTEST_TOTAL_SHARDS=105 GTEST_SHARD_INDEX=79 C:\buildbot\as-builder-8\llvm-nvptx-nvidia-win\build\unittests\Support\.\SupportTests.exe
--


Note: This is test shard 80 of 105.

[==========] Running 16 tests from 16 test suites.

[----------] Global test environment set-up.

[----------] 1 test from BinaryStreamTest

[ RUN      ] BinaryStreamTest.DropOperations

[       OK ] BinaryStreamTest.DropOperations (0 ms)

[----------] 1 test from BinaryStreamTest (0 ms total)



[----------] 1 test from CommandLineTest

[ RUN      ] CommandLineTest.TokenizeAndMarkEOLs

[       OK ] CommandLineTest.TokenizeAndMarkEOLs (0 ms)

[----------] 1 test from CommandLineTest (0 ms total)



[----------] 1 test from DataExtractorTest

[ RUN      ] DataExtractorTest.LEB128_error

[       OK ] DataExtractorTest.LEB128_error (0 ms)

[----------] 1 test from DataExtractorTest (0 ms total)



[----------] 1 test from Error

[ RUN      ] Error.ForwardToExpected

[       OK ] Error.ForwardToExpected (0 ms)

[----------] 1 test from Error (0 ms total)
...

aahrun pushed a commit to aahrun/llvm-project that referenced this pull request Dec 1, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants