[SystemZ] Improve foldMemoryOperandImpl: vec->FP conversions
Use FP-mem instructions when folding reloads into single-lane (W..) vector
instructions.

Only do this when all other operands of the instruction have already been
allocated to an FP (F0-F15) register.

Review: Ulrich Weigand

Differential Revision: https://reviews.llvm.org/D76705
JonPsson committed May 12, 2020
1 parent 42c7a6d commit 57feff9
Showing 10 changed files with 2,370 additions and 65 deletions.
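Context for the diff below: the core of the change is a new legality check in foldMemoryOperandImpl. A single-lane (W..) vector opcode may only be rewritten into its FP memory form when every other vector-register operand has already been assigned to an FP-addressable register, because the RXE/RXF memory instructions can only encode F0-F15. The following is a minimal sketch of that predicate, restated as a standalone helper for illustration; the helper name is an assumption, and in the commit the same loop runs inline in SystemZInstrInfo::foldMemoryOperandImpl (see the SystemZInstrInfo.cpp hunk further down).

// Sketch (not the committed code verbatim) of the "all other operands already
// allocated to FP registers" check this commit introduces.
#include "SystemZInstrInfo.h"                 // Target header; brings in the SystemZ::* register classes.
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"

using namespace llvm;

static bool otherOperandsAllocatedToFP(const MachineInstr &MI, unsigned OpNum,
                                       const TargetRegisterInfo *TRI,
                                       const VirtRegMap *VRM) {
  const MCInstrDesc &MCID = MI.getDesc();
  for (unsigned I = 0, E = MCID.getNumOperands(); I != E; ++I) {
    const MCOperandInfo &MCOI = MCID.OpInfo[I];
    // Skip non-register operands and the operand that is being folded.
    if (MCOI.OperandType != MCOI::OPERAND_REGISTER || I == OpNum)
      continue;
    const TargetRegisterClass *RC = TRI->getRegClass(MCOI.RegClass);
    // Only the 32/64-bit vector-register operands are constrained: the RXE/RXF
    // memory forms can only name the FP-overlapping registers F0-F15.
    if (RC != &SystemZ::VR32BitRegClass && RC != &SystemZ::VR64BitRegClass)
      continue;
    Register Reg = MI.getOperand(I).getReg();
    Register PhysReg = Register::isVirtualRegister(Reg)
                           ? (VRM ? VRM->getPhys(Reg) : Register())
                           : Reg;
    if (!PhysReg || !(SystemZ::FP32BitRegClass.contains(PhysReg) ||
                      SystemZ::FP64BitRegClass.contains(PhysReg) ||
                      SystemZ::VF128BitRegClass.contains(PhysReg)))
      return false; // Allocated to V16-V31 (or not yet allocated): cannot fold.
  }
  return true;
}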
24 changes: 12 additions & 12 deletions llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -438,8 +438,8 @@ let Uses = [FPC], mayRaiseFPException = 1,
def ADBR : BinaryRRE<"adbr", 0xB31A, any_fadd, FP64, FP64>;
def AXBR : BinaryRRE<"axbr", 0xB34A, any_fadd, FP128, FP128>;
}
def AEB : BinaryRXE<"aeb", 0xED0A, any_fadd, FP32, load, 4>;
def ADB : BinaryRXE<"adb", 0xED1A, any_fadd, FP64, load, 8>;
defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, load, 4>;
defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, load, 8>;
}

// Subtraction.
@@ -449,8 +449,8 @@ let Uses = [FPC], mayRaiseFPException = 1,
def SDBR : BinaryRRE<"sdbr", 0xB31B, any_fsub, FP64, FP64>;
def SXBR : BinaryRRE<"sxbr", 0xB34B, any_fsub, FP128, FP128>;

def SEB : BinaryRXE<"seb", 0xED0B, any_fsub, FP32, load, 4>;
def SDB : BinaryRXE<"sdb", 0xED1B, any_fsub, FP64, load, 8>;
defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, any_fsub, FP32, load, 4>;
defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, any_fsub, FP64, load, 8>;
}

// Multiplication.
@@ -460,8 +460,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MDBR : BinaryRRE<"mdbr", 0xB31C, any_fmul, FP64, FP64>;
def MXBR : BinaryRRE<"mxbr", 0xB34C, any_fmul, FP128, FP128>;
}
def MEEB : BinaryRXE<"meeb", 0xED17, any_fmul, FP32, load, 4>;
def MDB : BinaryRXE<"mdb", 0xED1C, any_fmul, FP64, load, 8>;
defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, load, 4>;
defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, any_fmul, FP64, load, 8>;
}

// f64 multiplication of two FP32 registers.
@@ -503,17 +503,17 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MAEBR : TernaryRRD<"maebr", 0xB30E, z_any_fma, FP32, FP32>;
def MADBR : TernaryRRD<"madbr", 0xB31E, z_any_fma, FP64, FP64>;

def MAEB : TernaryRXF<"maeb", 0xED0E, z_any_fma, FP32, FP32, load, 4>;
def MADB : TernaryRXF<"madb", 0xED1E, z_any_fma, FP64, FP64, load, 8>;
defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, load, 4>;
defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, load, 8>;
}

// Fused multiply-subtract.
let Uses = [FPC], mayRaiseFPException = 1 in {
def MSEBR : TernaryRRD<"msebr", 0xB30F, z_any_fms, FP32, FP32>;
def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_any_fms, FP64, FP64>;

def MSEB : TernaryRXF<"mseb", 0xED0F, z_any_fms, FP32, FP32, load, 4>;
def MSDB : TernaryRXF<"msdb", 0xED1F, z_any_fms, FP64, FP64, load, 8>;
defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, load, 4>;
defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, load, 8>;
}

// Division.
@@ -522,8 +522,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def DDBR : BinaryRRE<"ddbr", 0xB31D, any_fdiv, FP64, FP64>;
def DXBR : BinaryRRE<"dxbr", 0xB34D, any_fdiv, FP128, FP128>;

def DEB : BinaryRXE<"deb", 0xED0D, any_fdiv, FP32, load, 4>;
def DDB : BinaryRXE<"ddb", 0xED1D, any_fdiv, FP64, load, 8>;
defm DEB : BinaryRXEAndPseudo<"deb", 0xED0D, any_fdiv, FP32, load, 4>;
defm DDB : BinaryRXEAndPseudo<"ddb", 0xED1D, any_fdiv, FP64, load, 8>;
}

// Divide to integer.
59 changes: 54 additions & 5 deletions llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -2911,13 +2911,15 @@ class UnaryVRIaGeneric<string mnemonic, bits<16> opcode, ImmOpWithPattern imm>

class UnaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0,
bits<4> m5 = 0>
bits<4> m5 = 0, string fp_mnemonic = "">
: InstVRRa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2),
mnemonic#"\t$V1, $V2",
[(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2)))]> {
let M3 = type;
let M4 = m4;
let M5 = m5;
let OpKey = fp_mnemonic#!subst("VR", "FP", !cast<string>(tr1.op));
let OpType = "reg";
}

class UnaryVRRaGeneric<string mnemonic, bits<16> opcode, bits<4> m4 = 0,
@@ -3627,14 +3629,16 @@ multiclass BinaryExtraVRRbSPairGeneric<string mnemonic, bits<16> opcode> {

class BinaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m5 = 0,
bits<4> m6 = 0>
bits<4> m6 = 0, string fp_mnemonic = "">
: InstVRRc<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3),
mnemonic#"\t$V1, $V2, $V3",
[(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
(tr2.vt tr2.op:$V3)))]> {
let M4 = type;
let M5 = m5;
let M6 = m6;
let OpKey = fp_mnemonic#"MemFold"#!subst("VR", "FP", !cast<string>(tr1.op));
let OpType = "reg";
}

class BinaryVRRcGeneric<string mnemonic, bits<16> opcode, bits<4> m5 = 0,
@@ -3986,14 +3990,16 @@ multiclass CompareSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
}

class CompareVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr, bits<4> type>
TypedReg tr, bits<4> type, string fp_mnemonic = "">
: InstVRRa<opcode, (outs), (ins tr.op:$V1, tr.op:$V2),
mnemonic#"\t$V1, $V2",
[(set CC, (operator (tr.vt tr.op:$V1), (tr.vt tr.op:$V2)))]> {
let isCompare = 1;
let M3 = type;
let M4 = 0;
let M5 = 0;
let OpKey = fp_mnemonic#!subst("VR", "FP", !cast<string>(tr.op));
let OpType = "reg";
}

class CompareVRRaGeneric<string mnemonic, bits<16> opcode>
@@ -4407,7 +4413,8 @@ multiclass TernaryExtraVRRdGeneric<string mnemonic, bits<16> opcode> {
}

class TernaryVRRe<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> m5 = 0, bits<4> type = 0>
TypedReg tr1, TypedReg tr2, bits<4> m5 = 0, bits<4> type = 0,
string fp_mnemonic = "">
: InstVRRe<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
mnemonic#"\t$V1, $V2, $V3, $V4",
@@ -4416,6 +4423,8 @@ class TernaryVRRe<string mnemonic, bits<16> opcode, SDPatternOperator operator,
(tr1.vt tr1.op:$V4)))]> {
let M5 = m5;
let M6 = type;
let OpKey = fp_mnemonic#"MemFold"#!subst("VR", "FP", !cast<string>(tr1.op));
let OpType = "reg";
}

class TernaryVRReFloatGeneric<string mnemonic, bits<16> opcode>
@@ -4785,6 +4794,26 @@ class MemFoldPseudo<string mnemonic, RegisterOperand cls, bits<5> bytes,
let hasNoSchedulingInfo = 1;
}

// Same as MemFoldPseudo but for mapping a W... vector instruction
class MemFoldPseudo_FP<string mnemonic, RegisterOperand cls, bits<5> bytes,
AddressingMode mode>
: MemFoldPseudo<mnemonic, cls, bytes, mode> {
let OpKey = mnemonic#"r"#"MemFold"#cls;
}

class MemFoldPseudo_FPTern<string mnemonic, RegisterOperand cls, bits<5> bytes,
AddressingMode mode>
: Pseudo<(outs cls:$R1), (ins cls:$R2, cls:$R3, mode:$XBD2), []> {
let OpKey = mnemonic#"r"#"MemFold"#cls;
let OpType = "mem";
let MemKey = mnemonic#cls;
let MemType = "pseudo";
let mayLoad = 1;
let AccessBytes = bytes;
let HasIndex = 1;
let hasNoSchedulingInfo = 1;
}

// Same as MemFoldPseudo but for Load On Condition with CC operands.
class MemFoldPseudo_CondMove<string mnemonic, RegisterOperand cls, bits<5> bytes,
AddressingMode mode>
@@ -5072,7 +5101,6 @@ multiclass BinaryRXYAndPseudo<string mnemonic, bits<16> opcode,
SDPatternOperator operator, RegisterOperand cls,
SDPatternOperator load, bits<5> bytes,
AddressingMode mode = bdxaddr20only> {

def "" : BinaryRXY<mnemonic, opcode, operator, cls, load, bytes, mode> {
let MemKey = mnemonic#cls;
let MemType = "target";
@@ -5099,6 +5127,27 @@ multiclass BinaryRXPairAndPseudo<string mnemonic, bits<8> rxOpcode,
def _MemFoldPseudo : MemFoldPseudo<mnemonic, cls, bytes, bdxaddr12pair>;
}

multiclass BinaryRXEAndPseudo<string mnemonic, bits<16> opcode,
SDPatternOperator operator, RegisterOperand cls,
SDPatternOperator load, bits<5> bytes> {
def "" : BinaryRXE<mnemonic, opcode, operator, cls, load, bytes> {
let MemKey = mnemonic#cls;
let MemType = "target";
}
def _MemFoldPseudo : MemFoldPseudo_FP<mnemonic, cls, bytes, bdxaddr12pair>;
}

multiclass TernaryRXFAndPseudo<string mnemonic, bits<16> opcode,
SDPatternOperator operator, RegisterOperand cls1,
RegisterOperand cls2, SDPatternOperator load,
bits<5> bytes> {
def "" : TernaryRXF<mnemonic, opcode, operator, cls1, cls2, load, bytes> {
let MemKey = mnemonic#cls1;
let MemType = "target";
}
def _MemFoldPseudo : MemFoldPseudo_FPTern<mnemonic, cls1, bytes, bdxaddr12pair>;
}

multiclass CondUnaryRSYPairAndMemFold<string mnemonic, bits<16> opcode,
SDPatternOperator operator,
RegisterOperand cls, bits<5> bytes,
114 changes: 89 additions & 25 deletions llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -993,33 +993,36 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
MachineBasicBlock::iterator InsertPt, int FrameIndex,
LiveIntervals *LIS, VirtRegMap *VRM) const {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Size = MFI.getObjectSize(FrameIndex);
unsigned Opcode = MI.getOpcode();

// Check CC liveness if new instruction introduces a dead def of CC.
MCRegUnitIterator CCUnit(SystemZ::CC, TRI);
SlotIndex MISlot = SlotIndex();
LiveRange *CCLiveRange = nullptr;
bool CCLiveAtMI = true;
if (LIS) {
MISlot = LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot();
CCLiveRange = &LIS->getRegUnit(*CCUnit);
CCLiveAtMI = CCLiveRange->liveAt(MISlot);
}
++CCUnit;
assert(!CCUnit.isValid() && "CC only has one reg unit.");

if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
if (LIS != nullptr && (Opcode == SystemZ::LA || Opcode == SystemZ::LAY) &&
if (!CCLiveAtMI && (Opcode == SystemZ::LA || Opcode == SystemZ::LAY) &&
isInt<8>(MI.getOperand(2).getImm()) && !MI.getOperand(3).getReg()) {

// Check CC liveness, since new instruction introduces a dead
// def of CC.
MCRegUnitIterator CCUnit(SystemZ::CC, TRI);
LiveRange &CCLiveRange = LIS->getRegUnit(*CCUnit);
++CCUnit;
assert(!CCUnit.isValid() && "CC only has one reg unit.");
SlotIndex MISlot =
LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot();
if (!CCLiveRange.liveAt(MISlot)) {
// LA(Y) %reg, CONST(%reg) -> AGSI %mem, CONST
MachineInstr *BuiltMI = BuildMI(*InsertPt->getParent(), InsertPt,
MI.getDebugLoc(), get(SystemZ::AGSI))
.addFrameIndex(FrameIndex)
.addImm(0)
.addImm(MI.getOperand(2).getImm());
BuiltMI->findRegisterDefOperand(SystemZ::CC)->setIsDead(true);
CCLiveRange.createDeadDef(MISlot, LIS->getVNInfoAllocator());
return BuiltMI;
}
// LA(Y) %reg, CONST(%reg) -> AGSI %mem, CONST
MachineInstr *BuiltMI = BuildMI(*InsertPt->getParent(), InsertPt,
MI.getDebugLoc(), get(SystemZ::AGSI))
.addFrameIndex(FrameIndex)
.addImm(0)
.addImm(MI.getOperand(2).getImm());
BuiltMI->findRegisterDefOperand(SystemZ::CC)->setIsDead(true);
CCLiveRange->createDeadDef(MISlot, LIS->getVNInfoAllocator());
return BuiltMI;
}
return nullptr;
}
@@ -1173,16 +1176,51 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
}

// If the spilled operand is the final one or the instruction is
// commutable, try to change <INSN>R into <INSN>.
// commutable, try to change <INSN>R into <INSN>. Don't introduce a def of
// CC if it is live and MI does not define it.
unsigned NumOps = MI.getNumExplicitOperands();
int MemOpcode = SystemZ::getMemOpcode(Opcode);
if (MemOpcode == -1)
if (MemOpcode == -1 ||
(CCLiveAtMI && !MI.definesRegister(SystemZ::CC) &&
get(MemOpcode).hasImplicitDefOfPhysReg(SystemZ::CC)))
return nullptr;

// Check if all other vregs have a usable allocation in the case of vector
// to FP conversion.
const MCInstrDesc &MCID = MI.getDesc();
for (unsigned I = 0, E = MCID.getNumOperands(); I != E; ++I) {
const MCOperandInfo &MCOI = MCID.OpInfo[I];
if (MCOI.OperandType != MCOI::OPERAND_REGISTER || I == OpNum)
continue;
const TargetRegisterClass *RC = TRI->getRegClass(MCOI.RegClass);
if (RC == &SystemZ::VR32BitRegClass || RC == &SystemZ::VR64BitRegClass) {
Register Reg = MI.getOperand(I).getReg();
Register PhysReg = Register::isVirtualRegister(Reg)
? (VRM ? VRM->getPhys(Reg) : Register())
: Reg;
if (!PhysReg ||
!(SystemZ::FP32BitRegClass.contains(PhysReg) ||
SystemZ::FP64BitRegClass.contains(PhysReg) ||
SystemZ::VF128BitRegClass.contains(PhysReg)))
return nullptr;
}
}
// Fused multiply and add/sub need to have the same dst and accumulator reg.
bool FusedFPOp = (Opcode == SystemZ::WFMADB || Opcode == SystemZ::WFMASB ||
Opcode == SystemZ::WFMSDB || Opcode == SystemZ::WFMSSB);
if (FusedFPOp) {
Register DstReg = VRM->getPhys(MI.getOperand(0).getReg());
Register AccReg = VRM->getPhys(MI.getOperand(3).getReg());
if (OpNum == 0 || OpNum == 3 || DstReg != AccReg)
return nullptr;
}

// Try to swap compare operands if possible.
bool NeedsCommute = false;
if ((MI.getOpcode() == SystemZ::CR || MI.getOpcode() == SystemZ::CGR ||
MI.getOpcode() == SystemZ::CLR || MI.getOpcode() == SystemZ::CLGR) &&
MI.getOpcode() == SystemZ::CLR || MI.getOpcode() == SystemZ::CLGR ||
MI.getOpcode() == SystemZ::WFCDB || MI.getOpcode() == SystemZ::WFCSB ||
MI.getOpcode() == SystemZ::WFKDB || MI.getOpcode() == SystemZ::WFKSB) &&
OpNum == 0 && prepareCompareSwapOperands(MI))
NeedsCommute = true;

@@ -1218,7 +1256,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
}
}

if ((OpNum == NumOps - 1) || NeedsCommute) {
if ((OpNum == NumOps - 1) || NeedsCommute || FusedFPOp) {
const MCInstrDesc &MemDesc = get(MemOpcode);
uint64_t AccessBytes = SystemZII::getAccessSize(MemDesc.TSFlags);
assert(AccessBytes != 0 && "Size of access should be known");
@@ -1230,6 +1268,11 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
assert(NumOps == 2 && "Expected 2 register operands for a compare.");
MIB.add(MI.getOperand(NeedsCommute ? 1 : 0));
}
else if (FusedFPOp) {
MIB.add(MI.getOperand(0));
MIB.add(MI.getOperand(3));
MIB.add(MI.getOperand(OpNum == 1 ? 2 : 1));
}
else {
MIB.add(MI.getOperand(0));
if (NeedsCommute)
@@ -1247,8 +1290,29 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
MIB.addImm(CCValid);
MIB.addImm(NeedsCommute ? CCMask ^ CCValid : CCMask);
}
if (MIB->definesRegister(SystemZ::CC) &&
(!MI.definesRegister(SystemZ::CC) ||
MI.registerDefIsDead(SystemZ::CC))) {
MIB->addRegisterDead(SystemZ::CC, TRI);
if (CCLiveRange)
CCLiveRange->createDeadDef(MISlot, LIS->getVNInfoAllocator());
}
// Constrain the register classes if converted from a vector opcode. The
// allocated regs are in an FP reg-class per previous check above.
for (const MachineOperand &MO : MIB->operands())
if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) {
unsigned Reg = MO.getReg();
if (MRI.getRegClass(Reg) == &SystemZ::VR32BitRegClass)
MRI.setRegClass(Reg, &SystemZ::FP32BitRegClass);
else if (MRI.getRegClass(Reg) == &SystemZ::VR64BitRegClass)
MRI.setRegClass(Reg, &SystemZ::FP64BitRegClass);
else if (MRI.getRegClass(Reg) == &SystemZ::VR128BitRegClass)
MRI.setRegClass(Reg, &SystemZ::VF128BitRegClass);
}

transferDeadCC(&MI, MIB);
transferMIFlag(&MI, MIB, MachineInstr::NoSWrap);
transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);
return MIB;
}

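A note on the fused multiply-add/subtract handling in the hunk above: WFMADB/WFMSDB (and their 32-bit variants) take a separate accumulator operand, while the RXF memory forms MAEB/MADB/MSEB/MSDB are destructive, with R1 serving as both accumulator and result, R3 as the register multiplicand, and the storage operand supplying the other multiplicand. The sketch below restates the operand remapping from the diff as a hypothetical helper (the name and signature are assumptions) to make that constraint explicit: the fold is only performed when the spilled operand is one of the multiplicands and the destination and accumulator were assigned the same FP register.

// Hypothetical helper restating the fused multiply-add/sub remapping done
// inline in foldMemoryOperandImpl above.  Returns false when folding the
// reload is not legal for this operand.
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/VirtRegMap.h"

using namespace llvm;

static bool remapFusedFPOperands(const MachineInstr &MI, unsigned OpNum,
                                 const VirtRegMap &VRM,
                                 MachineInstrBuilder &MIB) {
  // WFMADB dst, mul1, mul2, acc  ->  MADB R1, R3, mem   (with R1 = dst = acc)
  Register DstReg = VRM.getPhys(MI.getOperand(0).getReg());
  Register AccReg = VRM.getPhys(MI.getOperand(3).getReg());
  if (OpNum == 0 || OpNum == 3 || DstReg != AccReg)
    return false;                       // Only a multiplicand may be spilled,
                                        // and dst/acc must share a register.
  MIB.add(MI.getOperand(0));            // Result (tied to the accumulator).
  MIB.add(MI.getOperand(3));            // Accumulator input.
  MIB.add(MI.getOperand(OpNum == 1 ? 2 : 1)); // Multiplicand kept in a register.
  return true;                          // Caller appends the frame-index operand.
}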
