Skip to content

Commit

Permalink
[SystemZ] Enable MachineCombiner for FP reassociation (#83546)
Browse files Browse the repository at this point in the history
Enable MachineCombining for FP add, sub and mul.

In order for this to work, the default instruction selection of reg/mem opcodes is disabled for ISD nodes that carry the flags that allow reassociation. The reg/mem folding is instead done after MachineCombiner by PeepholeOptimizer. SystemZInstrInfo optimizeLoadInstr() and foldMemoryOperandImpl() ("LoadMI version") have been implemented for this purpose also by this patch.
  • Loading branch information
JonPsson1 committed Apr 30, 2024
1 parent 622ec1f commit 6c32a1f
Show file tree
Hide file tree
Showing 17 changed files with 1,074 additions and 23 deletions.
14 changes: 14 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,11 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
// Try to expand a boolean SELECT_CCMASK using an IPM sequence.
SDValue expandSelectBoolean(SDNode *Node);

// Return true if the flags of N and the subtarget allows for
// reassociation, in which case a reg/reg opcode is needed as input to the
// MachineCombiner.
bool shouldSelectForReassoc(SDNode *N) const;

public:
static char ID;

Expand Down Expand Up @@ -2044,6 +2049,15 @@ SDValue SystemZDAGToDAGISel::expandSelectBoolean(SDNode *Node) {
return Result;
}

bool SystemZDAGToDAGISel::shouldSelectForReassoc(SDNode *N) const {
EVT VT = N->getValueType(0);
assert(VT.isFloatingPoint() && "Expected FP SDNode");
return N->getFlags().hasAllowReassociation() &&
N->getFlags().hasNoSignedZeros() && Subtarget->hasVector() &&
(VT != MVT::f32 || Subtarget->hasVectorEnhancements1()) &&
!N->isStrictFPOpcode();
}

void SystemZDAGToDAGISel::PreprocessISelDAG() {
// If we have conditional immediate loads, we always prefer
// using those over an IPM sequence.
Expand Down
18 changes: 12 additions & 6 deletions llvm/lib/Target/SystemZ/SystemZInstrFP.td
Original file line number Diff line number Diff line change
Expand Up @@ -430,8 +430,10 @@ let Uses = [FPC], mayRaiseFPException = 1,
def ADBR : BinaryRRE<"adbr", 0xB31A, any_fadd, FP64, FP64>;
def AXBR : BinaryRRE<"axbr", 0xB34A, any_fadd, FP128, FP128>;
}
defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, z_load, 4>;
defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, z_load, 8>;
defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, z_any_fadd_noreassoc, FP32,
z_load, 4>;
defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, z_any_fadd_noreassoc, FP64,
z_load, 8>;
}

// Subtraction.
Expand All @@ -441,8 +443,10 @@ let Uses = [FPC], mayRaiseFPException = 1,
def SDBR : BinaryRRE<"sdbr", 0xB31B, any_fsub, FP64, FP64>;
def SXBR : BinaryRRE<"sxbr", 0xB34B, any_fsub, FP128, FP128>;

defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, any_fsub, FP32, z_load, 4>;
defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, any_fsub, FP64, z_load, 8>;
defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, z_any_fsub_noreassoc, FP32,
z_load, 4>;
defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, z_any_fsub_noreassoc, FP64,
z_load, 8>;
}

// Multiplication.
Expand All @@ -452,8 +456,10 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MDBR : BinaryRRE<"mdbr", 0xB31C, any_fmul, FP64, FP64>;
def MXBR : BinaryRRE<"mxbr", 0xB34C, any_fmul, FP128, FP128>;
}
defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, z_load, 4>;
defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, any_fmul, FP64, z_load, 8>;
defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, z_any_fmul_noreassoc, FP32,
z_load, 4>;
defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, z_any_fmul_noreassoc, FP64,
z_load, 8>;
}

// f64 multiplication of two FP32 registers.
Expand Down
165 changes: 164 additions & 1 deletion llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,32 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
.addImm(CCValid).addImm(CCMask);
}

MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
// Check whether we can move the DefMI load, and that it only has one use.
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
assert(DefMI);
bool SawStore = false;
if (!DefMI->isSafeToMove(nullptr, SawStore) ||
!MRI->hasOneNonDBGUse(FoldAsLoadDefReg))
return nullptr;

int UseOpIdx =
MI.findRegisterUseOperandIdx(FoldAsLoadDefReg, /*TRI=*/nullptr);
assert(UseOpIdx != -1 && "Expected FoldAsLoadDefReg to be used by MI.");

// Check whether we can fold the load.
if (MachineInstr *FoldMI =
foldMemoryOperand(MI, {((unsigned)UseOpIdx)}, *DefMI)) {
FoldAsLoadDefReg = 0;
return FoldMI;
}

return nullptr;
}

bool SystemZInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
Register Reg,
MachineRegisterInfo *MRI) const {
Expand Down Expand Up @@ -1004,6 +1030,67 @@ SystemZInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
return nullptr;
}

bool SystemZInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
bool Invert) const {
unsigned Opc = Inst.getOpcode();
if (Invert) {
auto InverseOpcode = getInverseOpcode(Opc);
if (!InverseOpcode)
return false;
Opc = *InverseOpcode;
}

switch (Opc) {
default:
break;
// Adds and multiplications.
case SystemZ::WFADB:
case SystemZ::WFASB:
case SystemZ::WFAXB:
case SystemZ::VFADB:
case SystemZ::VFASB:
case SystemZ::WFMDB:
case SystemZ::WFMSB:
case SystemZ::WFMXB:
case SystemZ::VFMDB:
case SystemZ::VFMSB:
return (Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
Inst.getFlag(MachineInstr::MIFlag::FmNsz));
}

return false;
}

std::optional<unsigned>
SystemZInstrInfo::getInverseOpcode(unsigned Opcode) const {
// fadd => fsub
switch (Opcode) {
case SystemZ::WFADB:
return SystemZ::WFSDB;
case SystemZ::WFASB:
return SystemZ::WFSSB;
case SystemZ::WFAXB:
return SystemZ::WFSXB;
case SystemZ::VFADB:
return SystemZ::VFSDB;
case SystemZ::VFASB:
return SystemZ::VFSSB;
// fsub => fadd
case SystemZ::WFSDB:
return SystemZ::WFADB;
case SystemZ::WFSSB:
return SystemZ::WFASB;
case SystemZ::WFSXB:
return SystemZ::WFAXB;
case SystemZ::VFSDB:
return SystemZ::VFADB;
case SystemZ::VFSSB:
return SystemZ::VFASB;
default:
return std::nullopt;
}
}

MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex,
Expand Down Expand Up @@ -1338,7 +1425,83 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
LiveIntervals *LIS) const {
return nullptr;
MachineRegisterInfo *MRI = &MF.getRegInfo();
MachineBasicBlock *MBB = MI.getParent();

// For reassociable FP operations, any loads have been purposefully left
// unfolded so that MachineCombiner can do its work on reg/reg
// opcodes. After that, as many loads as possible are now folded.
// TODO: This may be beneficial with other opcodes as well as machine-sink
// can move loads close to their user in a different MBB, which the isel
// matcher did not see.
unsigned LoadOpc = 0;
unsigned RegMemOpcode = 0;
const TargetRegisterClass *FPRC = nullptr;
RegMemOpcode = MI.getOpcode() == SystemZ::WFADB ? SystemZ::ADB
: MI.getOpcode() == SystemZ::WFSDB ? SystemZ::SDB
: MI.getOpcode() == SystemZ::WFMDB ? SystemZ::MDB
: 0;
if (RegMemOpcode) {
LoadOpc = SystemZ::VL64;
FPRC = &SystemZ::FP64BitRegClass;
} else {
RegMemOpcode = MI.getOpcode() == SystemZ::WFASB ? SystemZ::AEB
: MI.getOpcode() == SystemZ::WFSSB ? SystemZ::SEB
: MI.getOpcode() == SystemZ::WFMSB ? SystemZ::MEEB
: 0;
if (RegMemOpcode) {
LoadOpc = SystemZ::VL32;
FPRC = &SystemZ::FP32BitRegClass;
}
}
if (!RegMemOpcode || LoadMI.getOpcode() != LoadOpc)
return nullptr;

// If RegMemOpcode clobbers CC, first make sure CC is not live at this point.
if (get(RegMemOpcode).hasImplicitDefOfPhysReg(SystemZ::CC)) {
assert(LoadMI.getParent() == MI.getParent() && "Assuming a local fold.");
assert(LoadMI != InsertPt && "Assuming InsertPt not to be first in MBB.");
for (MachineBasicBlock::iterator MII = std::prev(InsertPt);;
--MII) {
if (MII->definesRegister(SystemZ::CC, /*TRI=*/nullptr)) {
if (!MII->registerDefIsDead(SystemZ::CC, /*TRI=*/nullptr))
return nullptr;
break;
}
if (MII == MBB->begin()) {
if (MBB->isLiveIn(SystemZ::CC))
return nullptr;
break;
}
}
}

Register FoldAsLoadDefReg = LoadMI.getOperand(0).getReg();
if (Ops.size() != 1 || FoldAsLoadDefReg != MI.getOperand(Ops[0]).getReg())
return nullptr;
Register DstReg = MI.getOperand(0).getReg();
MachineOperand LHS = MI.getOperand(1);
MachineOperand RHS = MI.getOperand(2);
MachineOperand &RegMO = RHS.getReg() == FoldAsLoadDefReg ? LHS : RHS;
if ((RegMemOpcode == SystemZ::SDB || RegMemOpcode == SystemZ::SEB) &&
FoldAsLoadDefReg != RHS.getReg())
return nullptr;

MachineOperand &Base = LoadMI.getOperand(1);
MachineOperand &Disp = LoadMI.getOperand(2);
MachineOperand &Indx = LoadMI.getOperand(3);
MachineInstrBuilder MIB =
BuildMI(*MI.getParent(), InsertPt, MI.getDebugLoc(), get(RegMemOpcode), DstReg)
.add(RegMO)
.add(Base)
.add(Disp)
.add(Indx);
MIB->addRegisterDead(SystemZ::CC, &RI);
MRI->setRegClass(DstReg, FPRC);
MRI->setRegClass(RegMO.getReg(), FPRC);
transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);

return MIB;
}

bool SystemZInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
Expand Down
11 changes: 11 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZInstrInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,8 +254,13 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
const DebugLoc &DL, Register DstReg,
ArrayRef<MachineOperand> Cond, Register TrueReg,
Register FalseReg) const override;
MachineInstr *optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
Register &FoldAsLoadDefReg,
MachineInstr *&DefMI) const override;
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
MachineRegisterInfo *MRI) const override;

bool isPredicable(const MachineInstr &MI) const override;
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,
Expand Down Expand Up @@ -285,6 +290,12 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
Register VReg) const override;
MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
LiveIntervals *LIS) const override;

bool useMachineCombiner() const override { return true; }
bool isAssociativeAndCommutative(const MachineInstr &Inst,
bool Invert) const override;
std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;

MachineInstr *
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops,
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/SystemZ/SystemZInstrVector.td
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ let Predicates = [FeatureVector] in {
// LEY and LDY offer full 20-bit displacement fields. It's often better
// to use those instructions rather than force a 20-bit displacement
// into a GPR temporary.
let mayLoad = 1 in {
let mayLoad = 1, canFoldAsLoad = 1 in {
def VL32 : UnaryAliasVRX<z_load, v32sb, bdxaddr12pair>;
def VL64 : UnaryAliasVRX<z_load, v64db, bdxaddr12pair>;
}
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZOperators.td
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,18 @@ def any_fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3),
// Floating-point negative absolute.
def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;

// Floating-point operations which will not participate in reassociation, and
// therefore are candidates for reg/mem folding during isel.
def z_any_fadd_noreassoc : PatFrag<(ops node:$src1, node:$src2),
(any_fadd node:$src1, node:$src2),
[{ return !shouldSelectForReassoc(N); }]>;
def z_any_fsub_noreassoc : PatFrag<(ops node:$src1, node:$src2),
(any_fsub node:$src1, node:$src2),
[{ return !shouldSelectForReassoc(N); }]>;
def z_any_fmul_noreassoc : PatFrag<(ops node:$src1, node:$src2),
(any_fmul node:$src1, node:$src2),
[{ return !shouldSelectForReassoc(N); }]>;

// Strict floating-point fragments.
def z_any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
[(z_strict_fcmp node:$lhs, node:$rhs),
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,11 @@

using namespace llvm;

static cl::opt<bool> EnableMachineCombinerPass(
"systemz-machine-combiner",
cl::desc("Enable the machine combiner pass"),
cl::init(true), cl::Hidden);

// NOLINTNEXTLINE(readability-identifier-naming)
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
// Register the target.
Expand Down Expand Up @@ -245,6 +250,10 @@ bool SystemZPassConfig::addInstSelector() {

bool SystemZPassConfig::addILPOpts() {
addPass(&EarlyIfConverterID);

if (EnableMachineCombinerPass)
addPass(&MachineCombinerID);

return true;
}

Expand Down
26 changes: 13 additions & 13 deletions llvm/test/CodeGen/SystemZ/anyregcc.ll
Original file line number Diff line number Diff line change
Expand Up @@ -323,37 +323,37 @@ entry:
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .short 13
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
; Loc 9: Register
; CHECK-NEXT: .byte 1
; Loc 9: IndirectMem
; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
; Loc 10: Register
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .long 344
; Loc 10: IndirectMem
; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
; Loc 11: Register
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .long 352
; Loc 11: IndirectMem
; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
; Loc 12: Register
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .long 360
; Loc 12: IndirectMem
; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .short 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
; CHECK-NEXT: .long 368
define i64 @anyreg_test2(ptr %a1, ptr %a2, ptr %a3, ptr %a4, ptr %a5, ptr %a6, ptr %a7, ptr %a8, ptr %a9, ptr %a10, ptr %a11, ptr %a12) nounwind ssp uwtable {
entry:
%f = inttoptr i64 12297829382473034410 to ptr
Expand Down

0 comments on commit 6c32a1f

Please sign in to comment.