Skip to content

Commit 34c098b

Browse files
committed
[ARM] Prevent spilling between ldrex/strex pairs
Based on the same for AArch64: 4751cad At -O0, the fast register allocator may insert spills between the ldrex and strex instructions inserted by AtomicExpandPass when expanding atomicrmw instructions in LL/SC loops. To avoid this, expand to cmpxchg loops and therefore expand the cmpxchg pseudos after register allocation. Required a tweak to ARMExpandPseudo::ExpandCMP_SWAP to use the 4-byte encoding of UXT, since the pseudo instruction can be allocated a high register (R8-R15) which the 2-byte encoding doesn't support. However, the 4-byte encodings are not present for ARM v8-M Baseline. To enable this, two new pseudos are added for Thumb which are only valid for v8mbase, tCMP_SWAP_8 and tCMP_SWAP_16. The previously committed attempt in D101164 had to be reverted due to runtime failures in the test suites. Rather than spending time fixing that implementation (adding another implementation of atomic operations and more divergence between backends) I have chosen to follow the approach taken in D101163. Differential Revision: https://reviews.llvm.org/D101898 Depends on D101912
1 parent edf9d88 commit 34c098b

File tree

5 files changed

+107
-69
lines changed

5 files changed

+107
-69
lines changed

llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1566,6 +1566,15 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
15661566
Register DesiredReg = MI.getOperand(3).getReg();
15671567
Register NewReg = MI.getOperand(4).getReg();
15681568

1569+
if (IsThumb) {
1570+
assert(STI->hasV8MBaselineOps() &&
1571+
"CMP_SWAP not expected to be custom expanded for Thumb1");
1572+
assert((UxtOp == 0 || UxtOp == ARM::tUXTB || UxtOp == ARM::tUXTH) &&
1573+
"ARMv8-M.baseline does not have t2UXTB/t2UXTH");
1574+
assert(ARM::tGPRRegClass.contains(DesiredReg) &&
1575+
"DesiredReg used for UXT op must be tGPR");
1576+
}
1577+
15691578
MachineFunction *MF = MBB.getParent();
15701579
auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
15711580
auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
@@ -2779,20 +2788,23 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
27792788
case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true;
27802789
case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true;
27812790

2791+
case ARM::tCMP_SWAP_8:
2792+
assert(STI->isThumb());
2793+
return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB, ARM::tUXTB,
2794+
NextMBBI);
2795+
case ARM::tCMP_SWAP_16:
2796+
assert(STI->isThumb());
2797+
return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH, ARM::tUXTH,
2798+
NextMBBI);
2799+
27822800
case ARM::CMP_SWAP_8:
2783-
if (STI->isThumb())
2784-
return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXB, ARM::t2STREXB,
2785-
ARM::tUXTB, NextMBBI);
2786-
else
2787-
return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXB, ARM::STREXB,
2788-
ARM::UXTB, NextMBBI);
2801+
assert(!STI->isThumb());
2802+
return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXB, ARM::STREXB, ARM::UXTB,
2803+
NextMBBI);
27892804
case ARM::CMP_SWAP_16:
2790-
if (STI->isThumb())
2791-
return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH,
2792-
ARM::tUXTH, NextMBBI);
2793-
else
2794-
return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH,
2795-
ARM::UXTH, NextMBBI);
2805+
assert(!STI->isThumb());
2806+
return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH, ARM::UXTH,
2807+
NextMBBI);
27962808
case ARM::CMP_SWAP_32:
27972809
if (STI->isThumb())
27982810
return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0,

llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3299,9 +3299,9 @@ void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
32993299
unsigned Opcode;
33003300
EVT MemTy = cast<MemSDNode>(N)->getMemoryVT();
33013301
if (MemTy == MVT::i8)
3302-
Opcode = ARM::CMP_SWAP_8;
3302+
Opcode = Subtarget->isThumb() ? ARM::tCMP_SWAP_8 : ARM::CMP_SWAP_8;
33033303
else if (MemTy == MVT::i16)
3304-
Opcode = ARM::CMP_SWAP_16;
3304+
Opcode = Subtarget->isThumb() ? ARM::tCMP_SWAP_16 : ARM::CMP_SWAP_16;
33053305
else if (MemTy == MVT::i32)
33063306
Opcode = ARM::CMP_SWAP_32;
33073307
else

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19359,6 +19359,14 @@ ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
1935919359
if (AI->isFloatingPointOperation())
1936019360
return AtomicExpansionKind::CmpXChg;
1936119361

19362+
// At -O0, fast-regalloc cannot cope with the live vregs necessary to
19363+
// implement atomicrmw without spilling. If the target address is also on the
19364+
// stack and close enough to the spill slot, this can lead to a situation
19365+
// where the monitor always gets cleared and the atomic operation can never
19366+
// succeed. So at -O0 lower this operation to a CAS loop.
19367+
if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
19368+
return AtomicExpansionKind::CmpXChg;
19369+
1936219370
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
1936319371
bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
1936419372
return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW)

llvm/lib/Target/ARM/ARMInstrThumb.td

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1766,3 +1766,21 @@ def : tInstAlias<"asr${s}${p} $Rdm, $imm",
17661766
def tLDRConstPool
17671767
: tAsmPseudo<"ldr${p} $Rt, $immediate",
17681768
(ins tGPR:$Rt, const_pool_asm_imm:$immediate, pred:$p)>;
1769+
1770+
//===----------------------------------
1771+
// Atomic cmpxchg for -O0
1772+
//===----------------------------------
1773+
1774+
// See ARMInstrInfo.td. These two thumb specific pseudos are required to
1775+
// restrict the register class for the UXTB/UXTH ops used in the expansion.
1776+
1777+
let Constraints = "@earlyclobber $Rd,@earlyclobber $temp",
1778+
mayLoad = 1, mayStore = 1 in {
1779+
def tCMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
1780+
(ins GPR:$addr, tGPR:$desired, GPR:$new),
1781+
NoItinerary, []>, Sched<[]>;
1782+
1783+
def tCMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
1784+
(ins GPR:$addr, tGPR:$desired, GPR:$new),
1785+
NoItinerary, []>, Sched<[]>;
1786+
}

0 commit comments

Comments
 (0)