From fa8aded09f316b69dfdf29791309015b01bb32dc Mon Sep 17 00:00:00 2001 From: John Lu Date: Fri, 7 Nov 2025 13:50:47 -0600 Subject: [PATCH 01/10] Invert uses to delete s_cmp_eq* Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 72 ++++++++++++++++--- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 5 ++ .../branch-folding-implicit-def-subreg.ll | 9 ++- llvm/test/CodeGen/AMDGPU/fshl.ll | 18 ++--- llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 33 +++++++++ .../CodeGen/AMDGPU/workitem-intrinsic-opts.ll | 10 +-- 6 files changed, 114 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 7cb7f47ddb220..8fc551e7b9650 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10737,12 +10737,64 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, return false; } +bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const { + MachineBasicBlock *MBB = SCCDef->getParent(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + SmallVector<MachineInstr*> InvertInstr; + bool SCCIsDead = false; + + // Scan instructions for SCC uses that need to be inverted until SCC is dead. + for (MachineInstr &MI : + make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) { + if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, TRI, false) != -1) { + if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 || + MI.getOpcode() == AMDGPU::S_CSELECT_B64 || + MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 || + MI.getOpcode() == AMDGPU::S_CBRANCH_SCC1) + InvertInstr.push_back(&MI); + else + return false; + } + if (MI.modifiesRegister(AMDGPU::SCC, TRI)) { + SCCIsDead = true; + break; + } + } + + const MachineRegisterInfo &MRI = + SCCDef->getParent()->getParent()->getRegInfo(); + // If SCC is still live, verify that it is not live past the end of this + // block. + if (!SCCIsDead && MRI.tracksLiveness()) + SCCIsDead = MBB->computeRegisterLiveness(TRI, AMDGPU::SCC, MBB->end(), 0) == + MachineBasicBlock::LQR_Dead; + + // Invert uses + if (SCCIsDead) { + for (auto &MI : InvertInstr) { + if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 || + MI->getOpcode() == AMDGPU::S_CSELECT_B64) + swapOperands(*MI); + else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 || + MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) + MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 + ? AMDGPU::S_CBRANCH_SCC1 + : AMDGPU::S_CBRANCH_SCC0)); + else + llvm_unreachable("SCC used but no inversion handling"); + } + return true; + } + return false; +} + // SCC is already valid after SCCValid. // SCCRedefine will redefine SCC to the same value already available after // SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and // update kill/dead flags if necessary.
-static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, - const SIRegisterInfo &RI) { +bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, + const SIRegisterInfo &RI, + bool NeedInversion) const { MachineInstr *KillsSCC = nullptr; if (SCCValid->getParent() != SCCRedefine->getParent()) return false; @@ -10753,6 +10805,8 @@ static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, if (MI.killsRegister(AMDGPU::SCC, &RI)) KillsSCC = &MI; } + if (NeedInversion && !invertSCCUse(SCCRedefine)) + return false; if (MachineOperand *SccDef = SCCValid->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr)) SccDef->setIsDead(false); @@ -10786,7 +10840,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, return false; const auto optimizeCmpSelect = [&CmpInstr, SrcReg, CmpValue, MRI, - this]() -> bool { + this](bool NeedInversion) -> bool { if (CmpValue != 0) return false; @@ -10807,7 +10861,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def)) return false; - if (!optimizeSCC(Def, &CmpInstr, RI)) + if (!optimizeSCC(Def, &CmpInstr, RI, NeedInversion)) return false; // If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit @@ -10832,7 +10886,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) { MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg()); if (Select && foldableSelect(*Select)) - optimizeSCC(Select, Def, RI); + optimizeSCC(Select, Def, RI, false); } } } @@ -10913,7 +10967,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) return false; - if (!optimizeSCC(Def, &CmpInstr, RI)) + if (!optimizeSCC(Def, &CmpInstr, RI, false)) return false; if (!MRI->use_nodbg_empty(DefReg)) { @@ -10944,7 +10998,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMP_EQ_I32: case AMDGPU::S_CMPK_EQ_U32: case AMDGPU::S_CMPK_EQ_I32: - return optimizeCmpAnd(1, 32, true, false); + return optimizeCmpAnd(1, 32, true, false) || optimizeCmpSelect(true); case AMDGPU::S_CMP_GE_U32: case AMDGPU::S_CMPK_GE_U32: return optimizeCmpAnd(1, 32, false, false); @@ -10957,7 +11011,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMP_LG_I32: case AMDGPU::S_CMPK_LG_U32: case AMDGPU::S_CMPK_LG_I32: - return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect(); + return optimizeCmpAnd(0, 32, true, false) || optimizeCmpSelect(false); case AMDGPU::S_CMP_GT_U32: case AMDGPU::S_CMPK_GT_U32: return optimizeCmpAnd(0, 32, false, false); @@ -10965,7 +11019,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, case AMDGPU::S_CMPK_GT_I32: return optimizeCmpAnd(0, 32, false, true); case AMDGPU::S_CMP_LG_U64: - return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect(); + return optimizeCmpAnd(0, 64, true, false) || optimizeCmpSelect(false); } return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index c66985a19685b..95f367ea0c154 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -125,6 +125,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { unsigned SubIdx, const TargetRegisterClass *SubRC) const; private: + bool 
optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, + const SIRegisterInfo &RI, bool NeedInversion) const; + + bool invertSCCUse(MachineInstr *SCCDef) const; + void swapOperands(MachineInstr &Inst) const; std::pair diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 026b8ba2759f0..274a88c930130 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -701,11 +701,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc - ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc - ; GFX90A-NEXT: renamable $vgpr10 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc + ; GFX90A-NEXT: dead renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def $scc + ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr9, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.59, implicit killed $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53: ; GFX90A-NEXT: successors: %bb.61(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 72c2003058a01..1233c1fe12f72 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -879,8 +879,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: s_lshl_b32 s6, s4, 7 ; SI-NEXT: s_or_b32 s6, s5, s6 -; SI-NEXT: s_cmp_eq_u32 s6, 0 -; SI-NEXT: s_cselect_b32 s4, s4, s5 +; SI-NEXT: s_cselect_b32 s4, s5, s4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -893,8 +892,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s4, s2, 7 ; VI-NEXT: s_or_b32 s4, s3, s4 -; VI-NEXT: s_cmp_eq_u32 s4, 0 -; VI-NEXT: s_cselect_b32 s2, s2, s3 +; VI-NEXT: s_cselect_b32 s2, s3, s2 ; VI-NEXT: 
v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -908,8 +906,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s4, s2, 7 ; GFX9-NEXT: s_or_b32 s4, s3, s4 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -935,8 +932,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl_b32 s4, s2, 7 ; GFX10-NEXT: s_or_b32 s4, s3, s4 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s2, s2, s3 +; GFX10-NEXT: s_cselect_b32 s2, s3, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm @@ -946,11 +942,9 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s4, s2, 7 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s4, s3, s4 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: s_cselect_b32 s2, s2, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 s2, s3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index b5228e3054f0a..6fb1d49e53c45 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s +; Test deletion of redundant s_cmp* sX, 0 instructions. declare i32 @llvm.ctpop.i32(i32) declare i64 @llvm.ctpop.i64(i64) @@ -20,6 +21,38 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) { ret i32 %zext } +; s_lshl_b32 sets SCC if result is non-zero. +; Deletion of equal to zero comparison will require inversion of use. +define amdgpu_ps i32 @shl32_eq(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: shl32_eq: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s0, s0, s1 +; CHECK-NEXT: s_cselect_b64 s[0:1], 0, -1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: ; return to shader part epilog + %result = shl i32 %val0, %val1 + %cmp = icmp eq i32 %result, 0 + %zext = zext i1 %cmp to i32 + ret i32 %zext +} + +; 64-bit selection will generate two 32-bit selects. Inversion of multiple +; uses is required. 
+define amdgpu_ps i64 @shl32_eq_multi_use(i32 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: shl32_eq_multi_use: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s0, s0, 1 +; CHECK-NEXT: s_cselect_b32 s2, 0, s2 +; CHECK-NEXT: s_cselect_b32 s0, 0, s1 +; CHECK-NEXT: s_mov_b32 s1, s2 +; CHECK-NEXT: ; return to shader part epilog + %result = shl i32 %val0, 1 + %cmp = icmp eq i32 %result, 0 + %val64 = select i1 %cmp, i64 %val1, i64 0 + ret i64 %val64 +} + define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) { ; CHECK-LABEL: shl64: ; CHECK: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll index 4445383bd0ace..4ef2ac1f59e07 100644 --- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll @@ -180,8 +180,7 @@ define i1 @workgroup_zero() { ; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 -; DAGISEL-GFX8-NEXT: s_cmp_eq_u32 s4, 0 -; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], 0, -1 ; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -190,8 +189,7 @@ define i1 @workgroup_zero() { ; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 -; DAGISEL-GFX942-NEXT: s_cmp_eq_u32 s0, 0 -; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], 0, -1 ; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -208,9 +206,7 @@ define i1 @workgroup_zero() { ; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 -; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe -; DAGISEL-GFX12-NEXT: s_cmp_eq_u32 s0, 0 -; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0 +; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, 0, -1 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; DAGISEL-GFX12-NEXT: s_setpc_b64 s[30:31] From 7cac738128d2a48cfe4513a64b0274df838e9578 Mon Sep 17 00:00:00 2001 From: John Lu Date: Tue, 11 Nov 2025 09:17:37 -0600 Subject: [PATCH 02/10] Streamline code Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 34 +++++++++++++------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 8fc551e7b9650..0540461f33859 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10746,7 +10746,7 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const { // Scan instructions for SCC uses that need to be inverted until SCC is dead. 
for (MachineInstr &MI : make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) { - if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, TRI, false) != -1) { + if (MI.readsRegister(AMDGPU::SCC, TRI)) { if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 || MI.getOpcode() == AMDGPU::S_CSELECT_B64 || MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 || @@ -10755,7 +10755,7 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const { else return false; } - if (MI.modifiesRegister(AMDGPU::SCC, TRI)) { + if (MI.definesRegister(AMDGPU::SCC, TRI)) { SCCIsDead = true; break; } @@ -10769,23 +10769,23 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const { SCCIsDead = MBB->computeRegisterLiveness(TRI, AMDGPU::SCC, MBB->end(), 0) == MachineBasicBlock::LQR_Dead; + if (!SCCIsDead) + return false; + // Invert uses - if (SCCIsDead) { - for (auto &MI : InvertInstr) { - if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 || - MI->getOpcode() == AMDGPU::S_CSELECT_B64) - swapOperands(*MI); - else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 || - MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) - MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 - ? AMDGPU::S_CBRANCH_SCC1 - : AMDGPU::S_CBRANCH_SCC0)); - else - llvm_unreachable("SCC used but no inversion handling"); - } - return true; + for (MachineInstr *MI : InvertInstr) { + if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 || + MI->getOpcode() == AMDGPU::S_CSELECT_B64) + swapOperands(*MI); + else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 || + MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) + MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 + ? AMDGPU::S_CBRANCH_SCC1 + : AMDGPU::S_CBRANCH_SCC0)); + else + llvm_unreachable("SCC used but no inversion handling"); } - return false; + return true; } // SCC is already valid after SCCValid. From 781e6114b448170acc69c554df5f55eac66d0119 Mon Sep 17 00:00:00 2001 From: John Lu Date: Thu, 13 Nov 2025 15:33:40 -0600 Subject: [PATCH 03/10] Remove unnecessary parm, add comment, address feedback Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 32 ++++++++++++++------------ llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 0540461f33859..9a5fb178d5ae2 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10737,16 +10737,18 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, return false; } +// Invert all uses of SCC following SCCDef because SCCDef may be deleted and +// (incoming SCC) = !(SCC defined by SCCDef). +// Return true if all uses can be re-written, false otherwise. bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const { MachineBasicBlock *MBB = SCCDef->getParent(); - const SIRegisterInfo *TRI = ST.getRegisterInfo(); - SmallVector<MachineInstr*> InvertInstr; + SmallVector<MachineInstr *> InvertInstr; bool SCCIsDead = false; // Scan instructions for SCC uses that need to be inverted until SCC is dead.
for (MachineInstr &MI : make_range(std::next(MachineBasicBlock::iterator(SCCDef)), MBB->end())) { - if (MI.readsRegister(AMDGPU::SCC, TRI)) { + if (MI.readsRegister(AMDGPU::SCC, &RI)) { if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 || MI.getOpcode() == AMDGPU::S_CSELECT_B64 || MI.getOpcode() == AMDGPU::S_CBRANCH_SCC0 || @@ -10755,18 +10757,18 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const { else return false; } - if (MI.definesRegister(AMDGPU::SCC, TRI)) { + if (MI.definesRegister(AMDGPU::SCC, &RI) || + MI.killsRegister(AMDGPU::SCC, &RI)) { SCCIsDead = true; break; } } - const MachineRegisterInfo &MRI = - SCCDef->getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = SCCDef->getMF()->getRegInfo(); // If SCC is still live, verify that it is not live past the end of this // block. if (!SCCIsDead && MRI.tracksLiveness()) - SCCIsDead = MBB->computeRegisterLiveness(TRI, AMDGPU::SCC, MBB->end(), 0) == + SCCIsDead = MBB->computeRegisterLiveness(&RI, AMDGPU::SCC, MBB->end(), 0) == MachineBasicBlock::LQR_Dead; if (!SCCIsDead) @@ -10775,15 +10777,16 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const { // Invert uses for (MachineInstr *MI : InvertInstr) { if (MI->getOpcode() == AMDGPU::S_CSELECT_B32 || - MI->getOpcode() == AMDGPU::S_CSELECT_B64) + MI->getOpcode() == AMDGPU::S_CSELECT_B64) { swapOperands(*MI); - else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 || - MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) + } else if (MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 || + MI->getOpcode() == AMDGPU::S_CBRANCH_SCC1) { MI->setDesc(get(MI->getOpcode() == AMDGPU::S_CBRANCH_SCC0 ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_SCC0)); - else + } else { llvm_unreachable("SCC used but no inversion handling"); + } } return true; } @@ -10793,7 +10796,6 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const { // SCCValid. If there are no intervening SCC conflicts delete SCCRedefine and // update kill/dead flags if necessary. bool SIInstrInfo::optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, - const SIRegisterInfo &RI, bool NeedInversion) const { MachineInstr *KillsSCC = nullptr; if (SCCValid->getParent() != SCCRedefine->getParent()) @@ -10861,7 +10863,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (!setsSCCifResultIsNonZero(*Def) && !foldableSelect(*Def)) return false; - if (!optimizeSCC(Def, &CmpInstr, RI, NeedInversion)) + if (!optimizeSCC(Def, &CmpInstr, NeedInversion)) return false; // If s_or_b32 result, sY, is unused (i.e. 
it is effectively a 64-bit @@ -10886,7 +10888,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) { MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg()); if (Select && foldableSelect(*Select)) - optimizeSCC(Select, Def, RI, false); + optimizeSCC(Select, Def, false); } } } @@ -10967,7 +10969,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg)) return false; - if (!optimizeSCC(Def, &CmpInstr, RI, false)) + if (!optimizeSCC(Def, &CmpInstr, false)) return false; if (!MRI->use_nodbg_empty(DefReg)) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 95f367ea0c154..3fffe85eb55d7 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -126,7 +126,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { private: bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine, - const SIRegisterInfo &RI, bool NeedInversion) const; + bool NeedInversion) const; bool invertSCCUse(MachineInstr *SCCDef) const; From 139b622009b99df08d640c067fdb48ca7bf593f8 Mon Sep 17 00:00:00 2001 From: John Lu Date: Thu, 13 Nov 2025 15:52:42 -0600 Subject: [PATCH 04/10] Use new isLiveOut method Signed-off-by: John Lu --- llvm/include/llvm/CodeGen/MachineBasicBlock.h | 5 +++++ llvm/lib/CodeGen/MachineBasicBlock.cpp | 7 +++++++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 7 +------ llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 5 ++--- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index fcf7bab09fcff..8f76d2ad5ef53 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -514,6 +514,11 @@ class MachineBasicBlock LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask = LaneBitmask::getAll()) const; + /// Return true if the specified register is live out (i.e. in the live in set + /// of a successor) + LLVM_ABI bool isLiveOut(MCRegister Reg, + LaneBitmask LaneMask = LaneBitmask::getAll()) const; + // Iteration support for live in sets. These sets are kept in sorted // order by their register number. 
using livein_iterator = LiveInVector::const_iterator; diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index ba0b025167307..35a9da95b3b9f 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -639,6 +639,13 @@ bool MachineBasicBlock::isLiveIn(MCRegister Reg, LaneBitmask LaneMask) const { return I != livein_end() && (I->LaneMask & LaneMask).any(); } +bool MachineBasicBlock::isLiveOut(MCRegister Reg, LaneBitmask LaneMask) const { + for (MachineBasicBlock *S : successors()) + if (S->isLiveIn(Reg, LaneMask)) + return true; + return false; +} + void MachineBasicBlock::sortUniqueLiveIns() { llvm::sort(LiveIns, [](const RegisterMaskPair &LI0, const RegisterMaskPair &LI1) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 9a5fb178d5ae2..99a07876a0189 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10764,14 +10764,9 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const { } } - const MachineRegisterInfo &MRI = SCCDef->getMF()->getRegInfo(); // If SCC is still live, verify that it is not live past the end of this // block. - if (!SCCIsDead && MRI.tracksLiveness()) - SCCIsDead = MBB->computeRegisterLiveness(&RI, AMDGPU::SCC, MBB->end(), 0) == - MachineBasicBlock::LQR_Dead; - - if (!SCCIsDead) + if (!SCCIsDead && MBB->isLiveOut(AMDGPU::SCC)) return false; // Invert uses diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir index fa452f3717f0e..442bf4f7e6d64 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -1476,9 +1476,8 @@ body: | ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def dead $scc - ; GCN-NEXT: S_CMP_EQ_U32 [[S_AND_B32_]], 0, implicit-def $scc - ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: From 3bf05ff7c467d79c7f4f990a115543d1d5b39ac8 Mon Sep 17 00:00:00 2001 From: John Lu Date: Thu, 13 Nov 2025 16:16:16 -0600 Subject: [PATCH 05/10] Update test check Signed-off-by: John Lu --- .../test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 274a88c930130..c6cc3922d3952 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -702,8 +702,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, 
$vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: dead renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def $scc - ; GFX90A-NEXT: renamable $vgpr8 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec - ; GFX90A-NEXT: renamable $vgpr9, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.59, implicit killed $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53: From 65ccbf46710f1aae58236b3b285003c809bcd129 Mon Sep 17 00:00:00 2001 From: John Lu Date: Sun, 16 Nov 2025 01:29:17 -0600 Subject: [PATCH 06/10] Don't rely on isLiveOut. Use defines and kill flags Signed-off-by: John Lu --- llvm/include/llvm/CodeGen/MachineBasicBlock.h | 5 -- llvm/lib/CodeGen/MachineBasicBlock.cpp | 7 --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- .../branch-folding-implicit-def-subreg.ll | 5 +- llvm/test/CodeGen/AMDGPU/fshl.ll | 18 ++++-- llvm/test/CodeGen/AMDGPU/optimize-compare.mir | 5 +- llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 57 ++++++++++++------- .../CodeGen/AMDGPU/workitem-intrinsic-opts.ll | 10 +++- 8 files changed, 64 insertions(+), 45 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h index 8f76d2ad5ef53..fcf7bab09fcff 100644 --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -514,11 +514,6 @@ class MachineBasicBlock LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask = LaneBitmask::getAll()) const; - /// Return true if the specified register is live out (i.e. in the live in set - /// of a successor) - LLVM_ABI bool isLiveOut(MCRegister Reg, - LaneBitmask LaneMask = LaneBitmask::getAll()) const; - // Iteration support for live in sets. These sets are kept in sorted // order by their register number. 
using livein_iterator = LiveInVector::const_iterator; diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index 35a9da95b3b9f..ba0b025167307 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -639,13 +639,6 @@ bool MachineBasicBlock::isLiveIn(MCRegister Reg, LaneBitmask LaneMask) const { return I != livein_end() && (I->LaneMask & LaneMask).any(); } -bool MachineBasicBlock::isLiveOut(MCRegister Reg, LaneBitmask LaneMask) const { - for (MachineBasicBlock *S : successors()) - if (S->isLiveIn(Reg, LaneMask)) - return true; - return false; -} - void MachineBasicBlock::sortUniqueLiveIns() { llvm::sort(LiveIns, [](const RegisterMaskPair &LI0, const RegisterMaskPair &LI1) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 99a07876a0189..61ffa484db93d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10766,7 +10766,7 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const { // If SCC is still live, verify that it is not live past the end of this // block. - if (!SCCIsDead && MBB->isLiveOut(AMDGPU::SCC)) + if (!SCCIsDead) return false; // Invert uses diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index c6cc3922d3952..026b8ba2759f0 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -701,10 +701,11 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: dead renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def $scc + ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc + ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr10 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr11, dead renamable $sgpr18_sgpr19 = V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.59, implicit killed $scc + ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53: ; GFX90A-NEXT: successors: %bb.61(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 
1233c1fe12f72..72c2003058a01 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -879,7 +879,8 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: s_lshl_b32 s6, s4, 7 ; SI-NEXT: s_or_b32 s6, s5, s6 -; SI-NEXT: s_cselect_b32 s4, s5, s4 +; SI-NEXT: s_cmp_eq_u32 s6, 0 +; SI-NEXT: s_cselect_b32 s4, s4, s5 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -892,7 +893,8 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s4, s2, 7 ; VI-NEXT: s_or_b32 s4, s3, s4 -; VI-NEXT: s_cselect_b32 s2, s3, s2 +; VI-NEXT: s_cmp_eq_u32 s4, 0 +; VI-NEXT: s_cselect_b32 s2, s2, s3 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -906,7 +908,8 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s4, s2, 7 ; GFX9-NEXT: s_or_b32 s4, s3, s4 -; GFX9-NEXT: s_cselect_b32 s2, s3, s2 +; GFX9-NEXT: s_cmp_eq_u32 s4, 0 +; GFX9-NEXT: s_cselect_b32 s2, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -932,7 +935,8 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl_b32 s4, s2, 7 ; GFX10-NEXT: s_or_b32 s4, s3, s4 -; GFX10-NEXT: s_cselect_b32 s2, s3, s2 +; GFX10-NEXT: s_cmp_eq_u32 s4, 0 +; GFX10-NEXT: s_cselect_b32 s2, s2, s3 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm @@ -942,9 +946,11 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s4, s2, 7 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s4, s3, s4 -; GFX11-NEXT: s_cselect_b32 s2, s3, s2 +; GFX11-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-NEXT: s_cselect_b32 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir index 442bf4f7e6d64..fa452f3717f0e 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir @@ -1476,8 +1476,9 @@ body: | ; GCN-NEXT: liveins: $sgpr0, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def $scc - ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 1, killed [[COPY]], implicit-def dead $scc + ; GCN-NEXT: S_CMP_EQ_U32 [[S_AND_B32_]], 0, implicit-def $scc + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc ; GCN-NEXT: S_BRANCH %bb.1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index 6fb1d49e53c45..b01447abe3e9b 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -23,34 +23,53 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, 
i32 inreg %val1) { ; s_lshl_b32 sets SCC if result is non-zero. ; Deletion of equal to zero comparison will require inversion of use. +; FIXME: Can't invert because kill flag not set on last use. define amdgpu_ps i32 @shl32_eq(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: shl32_eq: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_lshl_b32 s0, s0, s1 -; CHECK-NEXT: s_cselect_b64 s[0:1], 0, -1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; CHECK-NEXT: v_readfirstlane_b32 s0, v0 +; CHECK-NEXT: s_lshl_b32 s0, s0, 1 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b32 s0, s1, 0 ; CHECK-NEXT: ; return to shader part epilog - %result = shl i32 %val0, %val1 + %result = shl i32 %val0, 1 %cmp = icmp eq i32 %result, 0 - %zext = zext i1 %cmp to i32 - ret i32 %zext + %select = select i1 %cmp, i32 %val1, i32 0 + ret i32 %select +} + +; s_lshl_b32 sets SCC if result is non-zero. +; Deletion of equal to zero comparison will require inversion of use. +define amdgpu_ps i32 @shl32_eq_with_scc_clobber(i32 inreg %val0, i32 inreg %val1) { +; CHECK-LABEL: shl32_eq_with_scc_clobber: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_lshl_b32 s0, s0, 1 +; CHECK-NEXT: s_cselect_b32 s0, 0, s1 +; CHECK-NEXT: s_xor_b32 s0, s0, s1 +; CHECK-NEXT: ; return to shader part epilog + %result = shl i32 %val0, 1 + %cmp = icmp eq i32 %result, 0 + %select = select i1 %cmp, i32 %val1, i32 0 + %xor = xor i32 %select, %val1 + ret i32 %xor } ; 64-bit selection will generate two 32-bit selects. Inversion of multiple ; uses is required. -define amdgpu_ps i64 @shl32_eq_multi_use(i32 inreg %val0, i64 inreg %val1) { -; CHECK-LABEL: shl32_eq_multi_use: +define amdgpu_ps i64 @shl32_eq_multi_use_with_scc_clobber(i32 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: shl32_eq_multi_use_with_scc_clobber: ; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s3, s2 +; CHECK-NEXT: s_mov_b32 s2, s1 ; CHECK-NEXT: s_lshl_b32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b32 s2, 0, s2 -; CHECK-NEXT: s_cselect_b32 s0, 0, s1 -; CHECK-NEXT: s_mov_b32 s1, s2 +; CHECK-NEXT: s_cselect_b32 s1, 0, s3 +; CHECK-NEXT: s_cselect_b32 s0, 0, s2 +; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; CHECK-NEXT: ; return to shader part epilog %result = shl i32 %val0, 1 %cmp = icmp eq i32 %result, 0 - %val64 = select i1 %cmp, i64 %val1, i64 0 - ret i64 %val64 + %select = select i1 %cmp, i64 %val1, i64 0 + %xor = xor i64 %select, %val1 + ret i64 %xor } define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) { @@ -693,14 +712,14 @@ define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() { ; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12 ; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB38_2 +; CHECK-NEXT: s_cbranch_scc0 .LBB41_2 ; CHECK-NEXT: ; %bb.1: ; %endif ; CHECK-NEXT: s_mov_b32 s0, 1 -; CHECK-NEXT: s_branch .LBB38_3 -; CHECK-NEXT: .LBB38_2: ; %if +; CHECK-NEXT: s_branch .LBB41_3 +; CHECK-NEXT: .LBB41_2: ; %if ; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_branch .LBB38_3 -; CHECK-NEXT: .LBB38_3: +; CHECK-NEXT: s_branch .LBB41_3 +; CHECK-NEXT: .LBB41_3: %cmp = icmp ne ptr addrspace(4) @1, null br i1 %cmp, label %endif, label %if diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll index 4ef2ac1f59e07..4445383bd0ace 100644 --- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll @@ -180,7 +180,8 @@ define i1 @workgroup_zero() { ; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 -; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], 0, -1 +; DAGISEL-GFX8-NEXT: s_cmp_eq_u32 s4, 0 +; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 ; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -189,7 +190,8 @@ define i1 @workgroup_zero() { ; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 -; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], 0, -1 +; DAGISEL-GFX942-NEXT: s_cmp_eq_u32 s0, 0 +; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -206,7 +208,9 @@ define i1 @workgroup_zero() { ; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 -; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, 0, -1 +; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe +; DAGISEL-GFX12-NEXT: s_cmp_eq_u32 s0, 0 +; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; DAGISEL-GFX12-NEXT: s_setpc_b64 s[30:31] From 35c3de8d0ffaafb220559f5edd82fe89e5a89bc0 Mon Sep 17 00:00:00 2001 From: John Lu Date: Mon, 17 Nov 2025 11:37:57 -0600 Subject: [PATCH 07/10] SCC is dead on return Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- llvm/test/CodeGen/AMDGPU/fshl.ll | 18 ++++++------------ llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 3 +-- .../CodeGen/AMDGPU/workitem-intrinsic-opts.ll | 10 +++------- 4 files changed, 11 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 61ffa484db93d..ace5150d8437f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10758,7 +10758,7 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const { return false; } if (MI.definesRegister(AMDGPU::SCC, &RI) || - MI.killsRegister(AMDGPU::SCC, &RI)) { + MI.killsRegister(AMDGPU::SCC, &RI) || MI.isReturn()) { SCCIsDead = true; break; } diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll index 72c2003058a01..1233c1fe12f72 100644 --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -879,8 +879,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: s_lshl_b32 s6, s4, 7 ; SI-NEXT: s_or_b32 s6, s5, s6 -; SI-NEXT: s_cmp_eq_u32 s6, 0 -; SI-NEXT: s_cselect_b32 s4, s4, s5 +; SI-NEXT: s_cselect_b32 s4, s5, s4 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 @@ -893,8 +892,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s4, s2, 7 ; VI-NEXT: s_or_b32 s4, s3, s4 -; VI-NEXT: s_cmp_eq_u32 s4, 0 -; VI-NEXT: s_cselect_b32 s2, s2, s3 +; VI-NEXT: s_cselect_b32 s2, s3, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 @@ -908,8 +906,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b32 s4, s2, 7 ; GFX9-NEXT: s_or_b32 s4, s3, s4 -; GFX9-NEXT: s_cmp_eq_u32 s4, 0 -; GFX9-NEXT: s_cselect_b32 s2, s2, s3 +; GFX9-NEXT: s_cselect_b32 s2, s3, s2 ; GFX9-NEXT: v_mov_b32_e32 
v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -935,8 +932,7 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_lshl_b32 s4, s2, 7 ; GFX10-NEXT: s_or_b32 s4, s3, s4 -; GFX10-NEXT: s_cmp_eq_u32 s4, 0 -; GFX10-NEXT: s_cselect_b32 s2, s2, s3 +; GFX10-NEXT: s_cselect_b32 s2, s3, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm @@ -946,11 +942,9 @@ define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) { ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s4, s2, 7 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_or_b32 s4, s3, s4 -; GFX11-NEXT: s_cmp_eq_u32 s4, 0 -; GFX11-NEXT: s_cselect_b32 s2, s2, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_cselect_b32 s2, s3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index b01447abe3e9b..14404b5adfd6c 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -28,8 +28,7 @@ define amdgpu_ps i32 @shl32_eq(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: shl32_eq: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_lshl_b32 s0, s0, 1 -; CHECK-NEXT: s_cmp_eq_u32 s0, 0 -; CHECK-NEXT: s_cselect_b32 s0, s1, 0 +; CHECK-NEXT: s_cselect_b32 s0, 0, s1 ; CHECK-NEXT: ; return to shader part epilog %result = shl i32 %val0, 1 %cmp = icmp eq i32 %result, 0 diff --git a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll index 4445383bd0ace..4ef2ac1f59e07 100644 --- a/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/workitem-intrinsic-opts.ll @@ -180,8 +180,7 @@ define i1 @workgroup_zero() { ; DAGISEL-GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s12, s13 ; DAGISEL-GFX8-NEXT: s_or_b32 s4, s4, s14 -; DAGISEL-GFX8-NEXT: s_cmp_eq_u32 s4, 0 -; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], -1, 0 +; DAGISEL-GFX8-NEXT: s_cselect_b64 s[4:5], 0, -1 ; DAGISEL-GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; DAGISEL-GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -190,8 +189,7 @@ define i1 @workgroup_zero() { ; DAGISEL-GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s12, s13 ; DAGISEL-GFX942-NEXT: s_or_b32 s0, s0, s14 -; DAGISEL-GFX942-NEXT: s_cmp_eq_u32 s0, 0 -; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; DAGISEL-GFX942-NEXT: s_cselect_b64 s[0:1], 0, -1 ; DAGISEL-GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; DAGISEL-GFX942-NEXT: s_setpc_b64 s[30:31] ; @@ -208,9 +206,7 @@ define i1 @workgroup_zero() { ; DAGISEL-GFX12-NEXT: s_or_b32 s0, ttmp9, s0 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: s_or_b32 s0, s0, s1 -; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe -; DAGISEL-GFX12-NEXT: s_cmp_eq_u32 s0, 0 -; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, -1, 0 +; DAGISEL-GFX12-NEXT: s_cselect_b32 s0, 0, -1 ; DAGISEL-GFX12-NEXT: s_wait_alu 0xfffe ; DAGISEL-GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; DAGISEL-GFX12-NEXT: s_setpc_b64 s[30:31] From adc2b32653c53a5d03e113173dc6312018b9c6e6 Mon Sep 17 00:00:00 2001 From: John Lu 
Date: Mon, 17 Nov 2025 12:07:33 -0600 Subject: [PATCH 08/10] Add scc killed during instruction insertion for s_cbranch_scc? --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 11 +++++++++++ llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 +- .../AMDGPU/branch-folding-implicit-def-subreg.ll | 5 ++--- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e37d739fc25df..b213f9c91d507 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6487,6 +6487,17 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MI.getOperand(0).setReg(OriginalExec); return BB; } + case AMDGPU::S_CBRANCH_SCC0: + case AMDGPU::S_CBRANCH_SCC1: { + MachineBasicBlock *TBB = nullptr; + MachineBasicBlock *FBB = nullptr; + SmallVector<MachineOperand> Cond; + TII->analyzeBranch(*BB, TBB, FBB, Cond); + if (TBB && !TBB->isLiveIn(AMDGPU::SCC) && FBB && + !FBB->isLiveIn(AMDGPU::SCC)) + MI.addRegisterKilled(AMDGPU::SCC, TRI); + } + return BB; default: if (TII->isImage(MI) || TII->isMUBUF(MI)) { if (!MI.mayStore()) diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 1931e0be15152..694f39d7b0881 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1548,7 +1548,7 @@ defm S_BRANCH : SOPP_With_Relaxation< [(br bb:$simm16)]>; } -let Uses = [SCC] in { +let usesCustomInserter = 1, Uses = [SCC] in { defm S_CBRANCH_SCC0 : SOPP_With_Relaxation< "s_cbranch_scc0" , (ins SOPPBrTarget:$simm16), "$simm16" diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 026b8ba2759f0..c6cc3922d3952 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -701,11 +701,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.59(0x40000000), %bb.53(0x40000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr56_sgpr57:0x000000000000000F, $sgpr62_sgpr63, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x0000000000000003, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x0000000000000003, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def dead $scc - ; GFX90A-NEXT: S_CMP_EQ_U32 killed renamable $sgpr17, 0, implicit-def $scc + ; GFX90A-NEXT: dead renamable $sgpr17 = S_BFE_U32 renamable $sgpr20, 65560, implicit-def $scc ; GFX90A-NEXT: renamable $vgpr10 = V_ADD_CO_U32_e32 4096, $vgpr0, implicit-def $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr11, dead renamable $sgpr18_sgpr19 =
V_ADDC_U32_e64 0, 0, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: S_CBRANCH_SCC1 %bb.59, implicit killed $scc + ; GFX90A-NEXT: S_CBRANCH_SCC0 %bb.59, implicit killed $scc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.53: ; GFX90A-NEXT: successors: %bb.61(0x80000000) From d01205f661549d259b71c6d650169be7b6253fb6 Mon Sep 17 00:00:00 2001 From: John Lu Date: Mon, 17 Nov 2025 13:20:48 -0600 Subject: [PATCH 09/10] Inversion test does not require scc clobber Signed-off-by: John Lu --- llvm/test/CodeGen/AMDGPU/s_cmp_0.ll | 42 ++++++++--------------------- 1 file changed, 11 insertions(+), 31 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll index 14404b5adfd6c..6f4212b13433b 100644 --- a/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll +++ b/llvm/test/CodeGen/AMDGPU/s_cmp_0.ll @@ -23,7 +23,6 @@ define amdgpu_ps i32 @shl32(i32 inreg %val0, i32 inreg %val1) { ; s_lshl_b32 sets SCC if result is non-zero. ; Deletion of equal to zero comparison will require inversion of use. -; FIXME: Can't invert because kill flag not set on last use. define amdgpu_ps i32 @shl32_eq(i32 inreg %val0, i32 inreg %val1) { ; CHECK-LABEL: shl32_eq: ; CHECK: ; %bb.0: @@ -36,39 +35,20 @@ define amdgpu_ps i32 @shl32_eq(i32 inreg %val0, i32 inreg %val1) { ret i32 %select } -; s_lshl_b32 sets SCC if result is non-zero. -; Deletion of equal to zero comparison will require inversion of use. -define amdgpu_ps i32 @shl32_eq_with_scc_clobber(i32 inreg %val0, i32 inreg %val1) { -; CHECK-LABEL: shl32_eq_with_scc_clobber: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_lshl_b32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b32 s0, 0, s1 -; CHECK-NEXT: s_xor_b32 s0, s0, s1 -; CHECK-NEXT: ; return to shader part epilog - %result = shl i32 %val0, 1 - %cmp = icmp eq i32 %result, 0 - %select = select i1 %cmp, i32 %val1, i32 0 - %xor = xor i32 %select, %val1 - ret i32 %xor -} - ; 64-bit selection will generate two 32-bit selects. Inversion of multiple ; uses is required. 
-define amdgpu_ps i64 @shl32_eq_multi_use_with_scc_clobber(i32 inreg %val0, i64 inreg %val1) { -; CHECK-LABEL: shl32_eq_multi_use_with_scc_clobber: +define amdgpu_ps i64 @shl32_eq_multi_use(i32 inreg %val0, i64 inreg %val1) { +; CHECK-LABEL: shl32_eq_multi_use: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_mov_b32 s3, s2 -; CHECK-NEXT: s_mov_b32 s2, s1 ; CHECK-NEXT: s_lshl_b32 s0, s0, 1 -; CHECK-NEXT: s_cselect_b32 s1, 0, s3 -; CHECK-NEXT: s_cselect_b32 s0, 0, s2 -; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; CHECK-NEXT: s_cselect_b32 s2, 0, s2 +; CHECK-NEXT: s_cselect_b32 s0, 0, s1 +; CHECK-NEXT: s_mov_b32 s1, s2 ; CHECK-NEXT: ; return to shader part epilog %result = shl i32 %val0, 1 %cmp = icmp eq i32 %result, 0 %select = select i1 %cmp, i64 %val1, i64 0 - %xor = xor i64 %select, %val1 - ret i64 %xor + ret i64 %select } define amdgpu_ps i32 @shl64(i64 inreg %val0, i64 inreg %val1) { @@ -711,14 +691,14 @@ define amdgpu_ps i32 @si_pc_add_rel_offset_must_not_optimize() { ; CHECK-NEXT: s_add_u32 s0, s0, __unnamed_1@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s1, s1, __unnamed_1@rel32@hi+12 ; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 -; CHECK-NEXT: s_cbranch_scc0 .LBB41_2 +; CHECK-NEXT: s_cbranch_scc0 .LBB40_2 ; CHECK-NEXT: ; %bb.1: ; %endif ; CHECK-NEXT: s_mov_b32 s0, 1 -; CHECK-NEXT: s_branch .LBB41_3 -; CHECK-NEXT: .LBB41_2: ; %if +; CHECK-NEXT: s_branch .LBB40_3 +; CHECK-NEXT: .LBB40_2: ; %if ; CHECK-NEXT: s_mov_b32 s0, 0 -; CHECK-NEXT: s_branch .LBB41_3 -; CHECK-NEXT: .LBB41_3: +; CHECK-NEXT: s_branch .LBB40_3 +; CHECK-NEXT: .LBB40_3: %cmp = icmp ne ptr addrspace(4) @1, null br i1 %cmp, label %endif, label %if From 729b232f6b6273c7734bc014b561a619b5aaead6 Mon Sep 17 00:00:00 2001 From: John Lu Date: Wed, 19 Nov 2025 09:40:39 -0600 Subject: [PATCH 10/10] Update comment. Cleaner return check Signed-off-by: John Lu --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b213f9c91d507..865eb61e481f5 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6496,8 +6496,8 @@ SITargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, if (TBB && !TBB->isLiveIn(AMDGPU::SCC) && FBB && !FBB->isLiveIn(AMDGPU::SCC)) MI.addRegisterKilled(AMDGPU::SCC, TRI); - } return BB; + } default: if (TII->isImage(MI) || TII->isMUBUF(MI)) { if (!MI.mayStore()) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index ace5150d8437f..c2786b49c36bd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10758,14 +10758,15 @@ bool SIInstrInfo::invertSCCUse(MachineInstr *SCCDef) const { return false; } if (MI.definesRegister(AMDGPU::SCC, &RI) || - MI.killsRegister(AMDGPU::SCC, &RI) || MI.isReturn()) { + MI.killsRegister(AMDGPU::SCC, &RI)) { SCCIsDead = true; break; } } + if (MBB->succ_empty()) + SCCIsDead = true; - // If SCC is still live, verify that it is not live past the end of this - // block. + // SCC may have more uses. Can't invert all of them. if (!SCCIsDead) return false;