Skip to content
38 changes: 33 additions & 5 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10160,7 +10160,7 @@ static bool followSubRegDef(MachineInstr &MI,
}

MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
MachineRegisterInfo &MRI) {
const MachineRegisterInfo &MRI) {
assert(MRI.isSSA());
if (!P.Reg.isVirtual())
return nullptr;
Expand Down Expand Up @@ -10625,6 +10625,8 @@ bool SIInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
static bool optimizeSCC(MachineInstr *SCCValid, MachineInstr *SCCRedefine,
const SIRegisterInfo &RI) {
MachineInstr *KillsSCC = nullptr;
if (SCCValid->getParent() != SCCRedefine->getParent())
return false;
for (MachineInstr &MI : make_range(std::next(SCCValid->getIterator()),
SCCRedefine->getIterator())) {
if (MI.modifiesRegister(AMDGPU::SCC, &RI))
Expand Down Expand Up @@ -10669,8 +10671,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (CmpValue != 0)
return false;

MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
if (!Def || Def->getParent() != CmpInstr.getParent())
MachineInstr *Def = MRI->getVRegDef(SrcReg);
if (!Def)
return false;

// For S_OP that set SCC = DST!=0, do the transformation
Expand All @@ -10689,6 +10691,32 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
if (!optimizeSCC(Def, &CmpInstr, RI))
return false;

// If s_or_b32 result, sY, is unused (i.e. it is effectively a 64-bit
// s_cmp_lg of a register pair) and the inputs are the hi and lo-halves of a
// 64-bit foldableSelect then delete s_or_b32 in the sequence:
// sX = s_cselect_b64 (non-zero imm), 0
// sLo = copy sX.sub0
// sHi = copy sX.sub1
// sY = s_or_b32 sLo, sHi
Copy link
Contributor

@shiltian shiltian Nov 5, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This pattern is very specific. If sY is dead, won't all its operands be dead automatically (if there are no other uses)? Why do we need special handling here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

s_or_b32 may still be alive because it defines scc. This optimization recognizes that its scc def is redundant, because the same scc value is already available after the s_cselect_b64.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is intended to deal with redundant s_or_b32s that are generated when lowering 64-bit add/sub on R600 targets. R600 does not have a 64-bit s_cmp, so the s_or_b32 is used in the lowering.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See llvm/test/CodeGen/AMDGPU/carryout-selection.ll for an example of why this optimization is needed.

if (Def->getOpcode() == AMDGPU::S_OR_B32 &&
MRI->use_nodbg_empty(Def->getOperand(0).getReg())) {
const MachineOperand &OrOpnd1 = Def->getOperand(1);
const MachineOperand &OrOpnd2 = Def->getOperand(2);
if (OrOpnd1.isReg() && OrOpnd2.isReg()) {
MachineInstr *Def1 = MRI->getVRegDef(OrOpnd1.getReg());
MachineInstr *Def2 = MRI->getVRegDef(OrOpnd2.getReg());
if (Def1 && Def1->getOpcode() == AMDGPU::COPY && Def2 &&
Def2->getOpcode() == AMDGPU::COPY && Def1->getOperand(1).isReg() &&
Def2->getOperand(1).isReg() &&
Def1->getOperand(1).getSubReg() == AMDGPU::sub0 &&
Def2->getOperand(1).getSubReg() == AMDGPU::sub1 &&
Def1->getOperand(1).getReg() == Def2->getOperand(1).getReg()) {
MachineInstr *Select = MRI->getVRegDef(Def1->getOperand(1).getReg());
if (Select && foldableSelect(*Select))
optimizeSCC(Select, Def, RI);
}
}
}
return true;
};

Expand Down Expand Up @@ -10718,8 +10746,8 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
// s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n

MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
if (!Def || Def->getParent() != CmpInstr.getParent())
MachineInstr *Def = MRI->getVRegDef(SrcReg);
if (!Def)
return false;

if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -1687,7 +1687,7 @@ TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI,
/// skipping copy like instructions and subreg-manipulation pseudos.
/// Following another subreg of a reg:subreg isn't supported.
MachineInstr *getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
MachineRegisterInfo &MRI);
const MachineRegisterInfo &MRI);

/// \brief Return false if EXEC is not changed between the def of \p VReg at \p
/// DefMI and the use at \p UseMI. Should be run on SSA. Currently does not
Expand Down
Loading